The `airquality` dataset in R contains daily air quality measurements collected in New York City from May to September 1973. The dataset includes variables such as ozone concentration, solar radiation, wind speed, and daily temperature. In this analysis, we focus on predicting temperature, which is a key climatic variable with strong implications for environmental studies, health impacts, and energy demand forecasting. We apply two machine learning models: Random Forest, a powerful ensemble method that captures complex, non-linear relationships between predictors, and a shallow Neural Network, which provides an alternative regression approach by simulating interconnected neurons. By comparing the two models, we can assess their predictive performance and understand the relative importance of different meteorological variables in explaining temperature variations during this historical air quality study.
The following packages are required for the analysis. Install them
using: install.packages('package_name').
library(randomForest)
library(neuralnet)
The AirQuality dataset is loaded, missing values are removed, and the temperature variable is renamed for clarity. A simple plot of daily temperature is generated.
# Load the built-in data set and keep only rows without missing values.
data("airquality")
data1 <- na.omit(airquality)
# Rename the response column from "Temp" to "temperature".
names(data1)[names(data1) == "Temp"] <- "temperature"
# Plot daily temperature
plot(
  data1$temperature,
  type = "b",
  lwd = 1.7,
  col = "red",
  main = "Daily Temperature (New York, May–Sep 1973)",
  xlab = "Days",
  ylab = "Temperature (°F)"
)
The data are rescaled to a 0-1 range with min-max normalization for the neural network model, which requires its inputs to be on a comparable scale.
#' Min-max rescale a numeric vector to the [0, 1] interval.
#'
#' @param x A numeric vector.
#' @param na.rm If TRUE, missing values are ignored when finding the range
#'   (they stay NA in the result). The default FALSE keeps the original
#'   behaviour: any NA in `x` makes the whole result NA.
#' @return A numeric vector the same length as `x`. A constant vector maps
#'   to all zeros instead of NaN.
normalize <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # A constant vector has zero span; dividing by it would give 0/0 = NaN.
  # Dividing by 1 instead maps every element to 0.
  if (isTRUE(span == 0)) {
    span <- 1
  }
  (x - rng[1]) / span
}
# Rescale every column of data1 independently and reassemble a data frame.
# NOTE(review): min/max are taken over all rows of data1, i.e. before the
# train/test split, so the test rows influence the scaling.
maxmindf <- as.data.frame(Map(normalize, data1))
The dataset is split sequentially, without shuffling: the first 70% of rows form the training set and the remaining 30% form the test set used for model evaluation.
# Ordered (non-random) hold-out split: the first 70% of rows train the
# models and the remaining rows are held out for testing.
n_total <- nrow(data1)
n_train <- floor(0.7 * n_total)
# seq_len() and setdiff() stay well-defined when a partition is empty,
# whereas `1:0` counts backwards and yields c(1, 0).
train_index <- seq_len(n_train)
test_index <- setdiff(seq_len(n_total), train_index)
# Random Forest (non-normalized)
trainingdata <- data1[train_index, ]
validationdata <- data1[test_index, ]
# Neural Network (normalized)
trainset <- maxmindf[train_index, ]
testset <- maxmindf[test_index, ]
A Random Forest model is trained on the non-normalized training data, and predictions are made on the test set. Variable importance is visualized.
# Fix the RNG seed before fitting: a random forest is built with random
# sampling, so an unseeded fit (and every metric derived from it) changes
# from run to run. Output recorded from an earlier unseeded run will not
# match a seeded run exactly.
set.seed(123)
# Regress temperature on all remaining columns of the training data.
fit_rf <- randomForest(temperature ~ ., data = trainingdata)
print(fit_rf)
##
## Call:
## randomForest(formula = temperature ~ ., data = trainingdata)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 21.90291
## % Var explained: 74.5
# Diagnostic plot of the fitted forest.
plot(fit_rf)
# Predict the held-out rows and pair each prediction with its observed value.
predictions_rf <- predict(fit_rf, newdata = validationdata)
results_rf <- data.frame(
  actual = validationdata[["temperature"]],
  prediction = predictions_rf
)
head(results_rf)
## actual prediction
## 118 86 84.55155
## 120 97 83.20461
## 121 94 83.78017
## 122 96 83.96206
## 123 94 83.78117
## 124 91 84.43108
# Variable importance
# A second forest is grown with importance = TRUE solely for this plot;
# it is a separate fit from fit_rf, which produced the predictions.
fitRF <- randomForest(
  temperature ~ .,
  data = trainingdata,
  importance = TRUE
)
varImpPlot(fitRF, main = "Variable Importance (Random Forest)")
A shallow neural network with two hidden layers (2 and 1 neurons) is trained on the normalized training data. Predictions are made on the test set and rescaled to the original temperature scale.
# Fix the RNG seed before training: the network's starting weights are
# drawn at random, so an unseeded fit gives different weights, step counts
# and errors on each run. Output recorded from an earlier unseeded run will
# not match a seeded run exactly.
set.seed(123)
# Two hidden layers (2 neurons, then 1) with a linear output unit, trained
# on the normalized training rows.
nn <- neuralnet(
  temperature ~ Solar.R + Wind + Ozone + Month + Day,
  data = trainset,
  hidden = c(2, 1),
  linear.output = TRUE,
  threshold = 0.01
)
plot(nn)
print(nn$result.matrix)
## [,1]
## error 0.356755110
## reached.threshold 0.008118126
## steps 241.000000000
## Intercept.to.1layhid1 -1.265484415
## Solar.R.to.1layhid1 0.302884042
## Wind.to.1layhid1 -1.499481743
## Ozone.to.1layhid1 8.486858063
## Month.to.1layhid1 -1.604386443
## Day.to.1layhid1 -0.550632108
## Intercept.to.1layhid2 -2.112932905
## Solar.R.to.1layhid2 0.996853963
## Wind.to.1layhid2 1.215353753
## Ozone.to.1layhid2 -1.459230749
## Month.to.1layhid2 10.142140778
## Day.to.1layhid2 0.884386817
## Intercept.to.2layhid1 0.847105985
## 1layhid1.to.2layhid1 -1.947796749
## 1layhid2.to.2layhid1 -1.943769436
## Intercept.to.temperature 0.844277800
## 2layhid1.to.temperature -1.476804401
# Predictions
# Keep only the predictor columns of the normalized test rows.
nn_test <- testset[, c("Solar.R", "Wind", "Ozone", "Month", "Day")]
nn_results <- neuralnet::compute(nn, nn_test)
# Pair normalized observations with the network output (still on 0-1 scale).
results_nn <- data.frame(
  actual = testset[["temperature"]],
  prediction = nn_results[["net.result"]]
)
# De-normalize predictions
# Invert the min-max scaling using the temperature range of data1.
temp_min <- min(data1$temperature)
temp_max <- max(data1$temperature)
nn_pred_rescaled <- temp_min + (temp_max - temp_min) * results_nn$prediction
head(nn_pred_rescaled, 10)
## [1] 85.32858 85.25172 87.86246 86.51969 86.49958 87.22438 86.32930 86.16687
## [9] 87.13636 81.20172
The performance of both models is evaluated using Mean Absolute Error (MAE) and Root Mean Squared Error (RMSE) on the test set.
# Random Forest
# MAE and RMSE of the forest on the test rows (original temperature scale).
mae_rf <- with(results_rf, mean(abs(actual - prediction)))
rmse_rf <- with(results_rf, sqrt(mean((actual - prediction)^2)))
mae_rf
## [1] 6.094794
rmse_rf
## [1] 7.118229
# Neural Network (normalized scale)
# Errors here are on the 0-1 scale and are not comparable with the
# Random Forest errors, which are on the original temperature scale.
mae_nn <- with(results_nn, mean(abs(actual - prediction)))
rmse_nn <- with(results_nn, sqrt(mean((actual - prediction)^2)))
mae_nn
## [1] 0.1374343
rmse_nn
## [1] 0.1637118
# Neural Network (original scale)
# Compare the de-normalized predictions with the observed test temperatures.
# The actuals are read from validationdata rather than from results_rf, so
# this section no longer depends on the Random Forest results frame; the
# values are the same test-set temperatures.
mae_nn_real <- mean(abs(validationdata$temperature - nn_pred_rescaled))
rmse_nn_real <- sqrt(mean((validationdata$temperature - nn_pred_rescaled)^2))
mae_nn_real
## [1] 5.497371
rmse_nn_real
## [1] 6.54847
A plot compares observed temperatures with predictions from both Random Forest and Neural Network models.
# Observed vs predicted temperature on the test rows, one series per model.
# seq_len() avoids the backwards sequence c(1, 0) if the test set is empty.
days_test <- seq_len(nrow(validationdata))
plot(
  days_test, validationdata$temperature,
  type = "l", col = "black", lwd = 2,
  # Make the y-axis span the observations and both sets of predictions.
  ylim = range(c(validationdata$temperature, predictions_rf, nn_pred_rescaled)),
  xlab = "Index (Test Data)", ylab = "Temperature (°F)",
  main = "Observed vs Predicted Temperature", xaxt = "n", yaxt = "n"
)
# Axes are suppressed above and redrawn here with inward ticks.
axis(1, tck = 0.02, lwd = 2)
axis(2, tck = 0.02, lwd = 2)
box(lwd = 2)
grid(lwd = 1.5, col = "grey")
lines(days_test, predictions_rf, col = "blue", lwd = 2, lty = 1)
points(days_test, predictions_rf, col = "blue", pch = 7, cex = 1.5)
lines(days_test, nn_pred_rescaled, col = "red", lwd = 2, lty = 1)
points(days_test, nn_pred_rescaled, col = "red", pch = 2, cex = 1.5)
# The legend mirrors what is drawn: every series is a solid line, and the
# two model series carry the same point symbols used in points() above.
legend(
  "topright",
  legend = c("Observed", "Random Forest", "Neural Network"),
  col = c("black", "blue", "red"),
  lty = 1,
  pch = c(NA, 7, 2),
  lwd = 2
)
author: “Abdi-Basid ADAN” email: “abdi-basid@outlook.com”