library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(scales)
## Warning: package 'scales' was built under R version 4.4.1
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

Data

These data consist of 4 months’ worth of solar radiation measurements (kWh per m^2) as well as environmental conditions of temperature (deg C), air pressure (in Hg), humidity (%), and wind speed (m/s). The original data were measured every 5 minutes, so I followed the code in the example to summarize them by day, taking the mean, minimum, maximum, standard deviation, and range of each variable. I then rescaled each variable to a 0 to 1 scale for the neural network analysis.

solar <- readRDS("solarPrediction.rds")
head(solar)
## # A tibble: 6 × 8
##   DateTime            Radiation Temperature Pressure Humidity Speed TimeSunRise
##   <dttm>                  <dbl>       <dbl>    <dbl>    <dbl> <dbl> <time>     
## 1 2016-09-30 00:55:26      1.21          48     30.5       59  5.62 06:13      
## 2 2016-09-30 00:50:23      1.21          48     30.5       58  3.37 06:13      
## 3 2016-09-30 00:45:26      1.23          48     30.5       57  3.37 06:13      
## 4 2016-09-30 00:40:21      1.21          48     30.5       60  3.37 06:13      
## 5 2016-09-30 00:35:24      1.17          48     30.5       62  5.62 06:13      
## 6 2016-09-30 00:30:24      1.21          48     30.5       64  5.62 06:13      
## # ℹ 1 more variable: TimeSunSet <time>
# summarize variables by day with means, minima, maxima, and sd
solarDay <- solar |> 
  group_by(Date = date(DateTime)) |> 
  summarise(kWh_m2 = sum(Radiation * 60 * 5) * 2.77778e-7,
            avgTemperature = mean(Temperature),
            minTemperature = min(Temperature),
            maxTemperature = max(Temperature),
            sdTemperature = sd(Temperature),
            rangeTemperature = maxTemperature - minTemperature,
            avgPressure = mean(Pressure),
            minPressure = min(Pressure),
            maxPressure = max(Pressure),
            sdPressure = sd(Pressure),
            rangeTemperature = maxPressure - minPressure,
            avgHumidity = mean(Humidity),
            minHumidity = min(Humidity),
            maxHumidity = max(Humidity),
            sdPHumidity = sd(Humidity),
            rangeHumidity = maxHumidity - minHumidity,
            avgSpeed = mean(Speed),
            minSpeed = min(Speed),
            maxSpeed = max(Speed),
            sdPSpeed = sd(Speed),
            rangeSpeed = maxSpeed - minSpeed,
            dayLengthHrs = as.numeric(((max(TimeSunSet) - 
                                          min(TimeSunRise)) / 60 / 60)))

head(solarDay)
## # A tibble: 6 × 22
##   Date       kWh_m2 avgTemperature minTemperature maxTemperature sdTemperature
##   <date>      <dbl>          <dbl>          <dbl>          <dbl>         <dbl>
## 1 2016-09-01   6.22           54.7             48             63          4.78
## 2 2016-09-02   7.75           56.3             46             65          5.81
## 3 2016-09-03   2.91           55.6             50             63          3.48
## 4 2016-09-04   3.58           53.7             49             62          3.44
## 5 2016-09-05   7.17           53.2             46             64          6.11
## 6 2016-09-06   7.33           52.1             45             62          5.59
## # ℹ 16 more variables: rangeTemperature <dbl>, avgPressure <dbl>,
## #   minPressure <dbl>, maxPressure <dbl>, sdPressure <dbl>, avgHumidity <dbl>,
## #   minHumidity <dbl>, maxHumidity <dbl>, sdPHumidity <dbl>,
## #   rangeHumidity <dbl>, avgSpeed <dbl>, minSpeed <dbl>, maxSpeed <dbl>,
## #   sdPSpeed <dbl>, rangeSpeed <dbl>, dayLengthHrs <dbl>
solarRescale <- solarDay |>
  select(-Date) |>
  mutate(across(everything(), rescale))

There are numerous Neural Network methods available with train from the caret package. I chose to compare the original method from the example, neuralnet, with two others - Bayesian Regularized Neural Networks brnn, and Monotone Multi-Layer Perceptron Neural Network monmlp.

The neural net method allows me to optimize the number of nodes within each of 3 hidden layers. The Bayesian method appears to only have one hidden layer but allows for modulation of the number of nodes. The Monotone Multi-Layer Perceptron method allows for changing the number of hidden units (presumably the nodes in a single layer) as well as the number of models to run.

I am a little unclear as to the differences between all of the Multi-Layer Perceptron methods, as there are quite a few. From what I researched, the multilayer perceptron is a linear approach to neural networks (there is no cycling of data between hidden layers, as other methods may do), but typically uses non-linear activation functions. Playing around with these three methods and tuning their parameters to the best fit feels like a good place to start with these data. For actual use, I would look into the specific method in greater detail, but here I just compared the three methods.

Neural Networks

Method 1: Neural Net

modelLookup(model = "neuralnet")
##       model parameter                    label forReg forClass probModel
## 1 neuralnet    layer1 #Hidden Units in Layer 1   TRUE    FALSE     FALSE
## 2 neuralnet    layer2 #Hidden Units in Layer 2   TRUE    FALSE     FALSE
## 3 neuralnet    layer3 #Hidden Units in Layer 3   TRUE    FALSE     FALSE

This method has parameters to be tuned: layers 1 through 3 (i.e., three hidden layers within the black box).

solar_nn1 <- train(kWh_m2 ~ ., data = solarRescale, method = "neuralnet")
solar_nn1
## Neural Network 
## 
## 119 samples
##  20 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 119, 119, 119, 119, 119, 119, ... 
## Resampling results across tuning parameters:
## 
##   layer1  RMSE       Rsquared   MAE      
##   1       0.1596439  0.5990287  0.1169080
##   3       0.1579311  0.6456103  0.1163393
##   5       0.1625459  0.6451506  0.1197746
## 
## Tuning parameter 'layer2' was held constant at a value of 0
## Tuning
##  parameter 'layer3' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were layer1 = 3, layer2 = 0 and layer3 = 0.

When I first ran train with this model without any parameter specifications, it held both layers 2 and 3 at values of 0 and assigned one node to layer 1. When I ran it again, it found the lowest RMSE was with 3 nodes in layer 1, though the values were incredibly similar (0.164 vs 0.153). To compare multiple iterations of different layer values, I set up a tuning grid for the train function and ran it again.

# function to restore scaled values to original range of data
unscale <- function(x, maxim, minim) {
  y <- (x * (maxim - minim) + minim)
  return(y)
}
set.seed(2337)
solar_TuneGrid <- expand.grid(layer1 = c(1,3,5), layer2 = 0:2, layer3 = 0:2)

solar_nn2 <- train(kWh_m2 ~ ., data = solarRescale, method = "neuralnet", 
             tuneGrid = solar_TuneGrid)

solar_nn2$bestTune
##    layer1 layer2 layer3
## 27      5      2      2
solar_nn2$results[27,]
##    layer1 layer2 layer3      RMSE  Rsquared       MAE    RMSESD RsquaredSD
## 27      5      2      2 0.1441569 0.6807188 0.1067346 0.0255811  0.1133875
##         MAESD
## 27 0.01489742
unscale(x = solar_nn2$results$MAE[27], 
        maxim = max(solarDay$kWh_m2), minim = min(solarDay$kWh_m2))
## [1] 0.8285801

Running the neural net across all combinations in the tuning grid, the best model used 5 nodes in layer 1, 2 nodes in layer 2, and 2 nodes in layer 3 (RMSE = 0.144, R^2 = 0.68, MAE = 0.828). This fits slightly better than the untuned values I originally found.

Method 2: Bayesian Regularized Neural Networks

modelLookup(model = "brnn")
##   model parameter     label forReg forClass probModel
## 1  brnn   neurons # Neurons   TRUE    FALSE     FALSE

The Bayesian neural network has only one parameter for tuning: the number of neurons. I set up the same process as above to see if it performs any better than the original neural network.

set.seed(2337)
solar_TuneGrid <- expand.grid(neurons = 1:7)

solar_brnn <- train(kWh_m2 ~ ., data = solarRescale, method = "brnn", 
             tuneGrid = solar_TuneGrid, 
             # tried to mute output with verbose = FALSE but no luck with brnn
             trControl = trainControl(verboseIter = FALSE), verbose = FALSE)
solar_brnn$bestTune
##   neurons
## 1       1
solar_brnn$results[1,]
##   neurons      RMSE  Rsquared       MAE     RMSESD RsquaredSD      MAESD
## 1       1 0.1443731 0.6721898 0.1083969 0.01735476 0.07046816 0.01312246
unscale(x = solar_brnn$results$MAE[1], 
        maxim = max(solarDay$kWh_m2), minim = min(solarDay$kWh_m2))
## [1] 0.8414658

The Bayesian Regularized Neural Network does slightly better than the original neural network, tuning to just a single neuron (RMSE = 0.142, R^2 = 0.70, MAE = 0.83).

Method 3: Monotone Multi-Layer Perceptron Neural Network

modelLookup("monmlp")
##    model  parameter         label forReg forClass probModel
## 1 monmlp    hidden1 #Hidden Units   TRUE     TRUE      TRUE
## 2 monmlp n.ensemble       #Models   TRUE     TRUE      TRUE

The Monotone Multi-Layer Perceptron Neural Network has two tunable parameters: the number of hidden units (presumably the number of nodes in the hidden layer), and number of models.

set.seed(2337)

solar_monmlp <- train(kWh_m2 ~ ., data = solarRescale, method = "monmlp")
solar_monmlp$bestTune
##   hidden1 n.ensemble
## 1       1          1
solar_monmlp$results[1,]
##   hidden1 n.ensemble      RMSE Rsquared      MAE     RMSESD RsquaredSD
## 1       1          1 0.1547492 0.634464 0.116093 0.01598455 0.08262711
##        MAESD
## 1 0.01308427
unscale(x = solar_monmlp$results$MAE[1], 
        maxim = max(solarDay$kWh_m2), minim = min(solarDay$kWh_m2))
## [1] 0.9011232

I tried running the above method without any parameter adjustments. Train selected 1, 3 and 5 as values for hidden 1. As with the Bayesian neural network, the optimal model had 1 node (RMSE = 0.161, R^2 = 0.62, MAE = 0.94).

Of all the iterations of neural networks that I ran, the Bayesian Regularized Neural Network was the best model fit to the data with a single neuron (RMSE = 0.142, R^2 = 0.70, MAE = 0.83).

solar_obs <- solarDay |>
  select(Date, kWh_m2, avgTemperature, 
         avgPressure, avgHumidity, avgSpeed) |>
  rename("obs" = "kWh_m2") |>
  pivot_longer(cols = -c(Date, obs))
solar_pred <- predict(solar_brnn, solarRescale[,-1])
solar_pred <- tibble(data.frame(Date = solarDay$Date),
                     pred = unscale(solar_pred, 
                                     maxim = max(solarDay$kWh_m2), 
                                     minim = min(solarDay$kWh_m2)))
solar_pred_long <- solar_pred |>
  inner_join(solar_obs, by = join_by("Date")) |>
  pivot_longer(cols = c(pred, obs), 
               names_to = "obs_pred", values_to = "kWh_m2")

p1 <- ggplot(data = solar_pred_long) +
  geom_point(aes(x = value, y = kWh_m2, color = obs_pred)) +
  geom_path(aes(x = value, y = kWh_m2, group = Date),arrow = arrow(ends = "first", length = unit(0.075, "inches"))) +
  facet_wrap(~name, scales = "free", ncol = 1)
p1