Project Objective

The airquality dataset that ships with R contains daily air quality measurements collected in New York City from May to September 1973. The dataset includes variables such as ozone concentration, solar radiation, wind speed, and daily temperature. In this analysis, we focus on predicting temperature, which is a key climatic variable with strong implications for environmental studies, health impacts, and energy demand forecasting. We apply two machine learning models: a Random Forest, a powerful ensemble method that captures complex, non-linear relationships between predictors, and a shallow Neural Network, which provides an alternative regression approach by simulating interconnected neurons. By comparing the two models, we can assess their predictive performance and understand the relative importance of the different predictors in explaining temperature variations during this historical air quality study.

Packages

The following packages are required for the analysis. Install them using: install.packages('package_name').

library(randomForest)
library(neuralnet)

Data Preparation

The airquality dataset is loaded, rows containing missing values are removed, and the temperature column (Temp) is renamed to temperature for clarity. A simple plot of daily temperature is generated.

# Load the built-in data set and keep only the rows without missing values.
data("airquality")
data1 <- na.omit(airquality)

# Give the response column a more descriptive name.
names(data1)[names(data1) == "Temp"] <- "temperature"

# Plot daily temperature for the retained observations, in row order.
plot(
  data1$temperature,
  type = "b",
  col = "red",
  lwd = 1.7,
  xlab = "Days",
  ylab = "Temperature (°F)",
  main = "Daily Temperature (New York, May–Sep 1973)"
)

Data Normalization

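All variables are rescaled to the 0–1 range (min–max normalization) for the neural network, which trains more reliably when its inputs share a comparable scale. Note that the minimum and maximum of each variable are computed on the full dataset, before the train/test split.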

#' Min-max rescale a numeric vector to the interval [0, 1].
#'
#' @param x A numeric vector. Missing values are not removed: if `x` contains
#'   an `NA`, every element of the result is `NA`.
#' @return A numeric vector the same length as `x` in which the smallest value
#'   maps to 0 and the largest to 1. A vector whose values are all equal maps
#'   to all zeros.
normalize <- function(x) {
  x_min <- min(x)
  x_range <- max(x) - x_min
  # A constant vector has zero range; dividing would give 0 / 0 = NaN for
  # every element, so return zeros instead.
  if (isTRUE(x_range == 0)) {
    return(rep(0, length(x)))
  }
  (x - x_min) / x_range
}
# Rescale every column of data1 with normalize() and collect the results in a
# new data frame, one column per column of data1.
# NOTE(review): the scaling ranges come from all rows of data1, i.e. they are
# computed before any train/test split is made.
maxmindf <- as.data.frame(lapply(X = data1, FUN = normalize))

Train/Test Split

The dataset is split chronologically into a training set (the first 70% of rows) and a test set (the remaining 30%) for model evaluation.

# Chronological split: the first 70% of rows train the models and the
# remaining rows are held out for testing.
n_obs <- nrow(data1)
n_train <- floor(0.7 * n_obs)
# seq_len() gives an empty index when n_train is 0, whereas 1:0 would count
# down to c(1, 0); setdiff() likewise avoids a descending (n + 1):n range.
train_index <- seq_len(n_train)
test_index <- setdiff(seq_len(n_obs), train_index)

# Random Forest (non-normalized)
trainingdata <- data1[train_index, ]
validationdata <- data1[test_index, ]

# Neural Network (normalized): the same row positions, taken from the
# rescaled copy of the data
trainset <- maxmindf[train_index, ]
testset <- maxmindf[test_index, ]

Random Forest Model

A Random Forest model is trained on the non-normalized training data, and predictions are made on the test set. Variable importance is visualized.

# Fit the forest once, with importance = TRUE, so that the predictions
# evaluated below and the variable-importance plot describe the same model
# (a second, separately randomised fit is no longer needed).
# NOTE(review): no set.seed() call is visible in this script, so the figures
# echoed in the output below will differ slightly from run to run.
fit_rf <- randomForest(temperature ~ ., data = trainingdata, importance = TRUE)
print(fit_rf)

## 
## Call:
##  randomForest(formula = temperature ~ ., data = trainingdata, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 21.90291
##                     % Var explained: 74.5

# Error of the forest as a function of the number of trees
plot(fit_rf)

# Predict the held-out rows and pair each prediction with the observed value
predictions_rf <- predict(fit_rf, validationdata)

results_rf <- data.frame(actual = validationdata$temperature,
                         prediction = predictions_rf)
head(results_rf)

##     actual prediction
## 118     86   84.55155
## 120     97   83.20461
## 121     94   83.78017
## 122     96   83.96206
## 123     94   83.78117
## 124     91   84.43108

# Variable importance
# fitRF is kept as an alias of the single fitted model so that any later code
# referring to it keeps working.
fitRF <- fit_rf
varImpPlot(fitRF, main = "Variable Importance (Random Forest)")

Neural Network Model

A shallow neural network with two hidden layers (2 and 1 neurons) is trained on the normalized training data. Predictions are made on the test set and rescaled to the original temperature scale.

# Regress normalized temperature on the five remaining columns with a small
# feed-forward network: a first hidden layer of 2 units followed by a second
# hidden layer of 1 unit, and a linear (non-squashed) output.
nn <- neuralnet(
  formula = temperature ~ Solar.R + Wind + Ozone + Month + Day,
  data = trainset,
  hidden = c(2, 1),
  threshold = 0.01,
  linear.output = TRUE
)

# Draw the fitted network, then list its error, step count and weights.
plot(nn)
print(nn$result.matrix)

##                                   [,1]
## error                      0.356755110
## reached.threshold          0.008118126
## steps                    241.000000000
## Intercept.to.1layhid1     -1.265484415
## Solar.R.to.1layhid1        0.302884042
## Wind.to.1layhid1          -1.499481743
## Ozone.to.1layhid1          8.486858063
## Month.to.1layhid1         -1.604386443
## Day.to.1layhid1           -0.550632108
## Intercept.to.1layhid2     -2.112932905
## Solar.R.to.1layhid2        0.996853963
## Wind.to.1layhid2           1.215353753
## Ozone.to.1layhid2         -1.459230749
## Month.to.1layhid2         10.142140778
## Day.to.1layhid2            0.884386817
## Intercept.to.2layhid1      0.847105985
## 1layhid1.to.2layhid1      -1.947796749
## 1layhid2.to.2layhid1      -1.943769436
## Intercept.to.temperature   0.844277800
## 2layhid1.to.temperature   -1.476804401

# Predictions: pass only the predictor columns of the test set to the network.
nn_test <- testset[, c("Solar.R", "Wind", "Ozone", "Month", "Day")]
nn_results <- neuralnet::compute(nn, nn_test)

# Pair each normalized observation with its normalized prediction.
results_nn <- data.frame(
  actual = testset$temperature,
  prediction = nn_results$net.result
)

# De-normalize predictions: invert the min-max scaling using the minimum and
# maximum of the original temperature column.
temp_min <- min(data1$temperature)
temp_max <- max(data1$temperature)
nn_pred_rescaled <- temp_min + (temp_max - temp_min) * results_nn$prediction

head(nn_pred_rescaled, 10)

##  [1] 85.32858 85.25172 87.86246 86.51969 86.49958 87.22438 86.32930 86.16687
##  [9] 87.13636 81.20172

Model Evaluation

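The performance of both models is evaluated using Mean Absolute Error (MAE) and Root Mean Squared Error (RMSE) on the test set. The Neural Network errors are reported on both the normalized (0–1) scale and the original temperature scale; only the latter are directly comparable with the Random Forest errors.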

# Error metrics shared by all evaluations below, so the three computations
# cannot drift apart.
# actual, predicted: numeric vectors of equal length.
calc_mae <- function(actual, predicted) {
  mean(abs(actual - predicted))
}
calc_rmse <- function(actual, predicted) {
  sqrt(mean((actual - predicted)^2))
}

# Random Forest
mae_rf <- calc_mae(results_rf$actual, results_rf$prediction)
rmse_rf <- calc_rmse(results_rf$actual, results_rf$prediction)
mae_rf

## [1] 6.094794

rmse_rf

## [1] 7.118229

# Neural Network (normalized scale)
# Both columns of results_nn are on the 0-1 scale, so these errors are not in
# temperature units.
mae_nn <- calc_mae(results_nn$actual, results_nn$prediction)
rmse_nn <- calc_rmse(results_nn$actual, results_nn$prediction)
mae_nn

## [1] 0.1374343

rmse_nn

## [1] 0.1637118

# Neural Network (original scale)
# Compare against the observed test temperatures directly rather than
# borrowing them from the Random Forest results frame.
mae_nn_real <- calc_mae(validationdata$temperature, nn_pred_rescaled)
rmse_nn_real <- calc_rmse(validationdata$temperature, nn_pred_rescaled)
mae_nn_real

## [1] 5.497371

rmse_nn_real

## [1] 6.54847

Visualization of Observed vs Predicted

A plot compares observed temperatures with predictions from both Random Forest and Neural Network models.

# Position of each observation within the test set (x axis).
days_test <- seq_len(nrow(validationdata))

# Observed series first; the y range is widened to fit both prediction series.
plot(days_test, validationdata$temperature, type = "l", col = "black", lwd = 2,
     ylim = range(c(validationdata$temperature, predictions_rf,
                    nn_pred_rescaled)),
     xlab = "Index (Test Data)", ylab = "Temperature (°F)",
     main = "Observed vs Predicted Temperature", xaxt = "n", yaxt = "n")
# The default axes are suppressed above and redrawn here with inward ticks
# and heavier lines.
axis(1, tck = 0.02, lwd = 2)
axis(2, tck = 0.02, lwd = 2)
box(lwd = 2)
grid(lwd = 1.5, col = "grey")

# Random Forest predictions: solid blue line with pch 7 markers
lines(days_test, predictions_rf, col = "blue", lwd = 2, lty = 1)
points(days_test, predictions_rf, col = "blue", pch = 7, cex = 1.5)

# Neural Network predictions: solid red line with pch 2 markers
lines(days_test, nn_pred_rescaled, col = "red", lwd = 2, lty = 1)
points(days_test, nn_pred_rescaled, col = "red", pch = 2, cex = 1.5)

# The legend keys mirror what is drawn: all three series are solid lines and
# the two prediction series carry their point markers (none for Observed).
legend("topright", legend = c("Observed", "Random Forest", "Neural Network"),
       col = c("black", "blue", "red"), lty = 1, lwd = 2, pch = c(NA, 7, 2))

Contact

Author: Abdi-Basid ADAN; email: abdi-basid@outlook.com

Machine Learning Approaches for Predicting Temperature using the New York AirQuality Dataset (May–September 1973)

Abdi-Basid ADAN