library(solarf,quietly = TRUE)
library(knitr,quietly = TRUE)
library(ggplot2,quietly = TRUE)
#Theme for Plot
science_theme = theme(panel.background=element_blank(), panel.grid.major = element_line(size = 0.5, color = "grey"),
axis.line = element_line(size = 0.7, color = "black"), legend.position = c(0.9,
0.7), text = element_text(size = 14))
#Load data
#SCBH1 <- readRDS("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/SCBH1.rds")
C0875 <- read.csv("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/C0875_2011_2014.csv")
station.vectorize.hour <- function(station.dataframe,variable){
library(reshape2)
use.only.last.ocurrence <- function(minute.obs){
return (minute.obs[1])
}
station.dataframe.vectorized <- dcast(data=station.dataframe, formula=YEAR + MON + DAY + HR ~ MIN,value.var = variable,fun.aggregate=use.only.last.ocurrence)
return (station.dataframe.vectorized)
}
C0875.vector.h <- station.vectorize.hour(C0875,"SOLR")
C0875.h <- C0875.vector.h[,c("YEAR","MON","DAY","HR")]
C0875.h$SOLR <- rowMeans(C0875.vector.h[,5:ncol(C0875.vector.h)],na.rm=TRUE) #Calculate the mean for the hour from min 0 to min 60 ; baseline is hence hour and 30 mins.
C0875.h$HR <- C0875.h$HR + 1
vector <- station.vectorize(C0875.h,"SOLR")
#vector <- station.vectorize(SCBH1,"SOLR")
#8am to 5pm
vector <- vector[,c(1:3,11:20)]
#remove days that have at least one missing value at any hour between 8am to 5pm
vector <- vector[complete.cases(vector),]
#2014 as test
test <- vector[vector$YEAR==2014,]
vector <- vector[vector$YEAR!=2014,]
#prepare the input for the linear model. first it verifies if the time difference between two days is 1, then it extract the observation
#TO-DO Decouple consecutive days logic from preparing the training dataset
prep_train_next_day_hour <- function (data,hour,nDaysBefore,verifyConsecutive=TRUE){
#Empty Dataframe
train <- data.frame(matrix(vector(), 0, nDaysBefore*10+1, dimnames=list(c(), c())), stringsAsFactors=F)
#Covert Timestamp for Time Arithmetic
timestamps <- ISOdate(year = vector[,"YEAR"], mon = vector[,"MON"], day = vector[,"DAY"],tz="HST")
j <- 1
for (i in (nDaysBefore+1):nrow(data)){
#If enabled, verify if the nDaysBefore are all consecutive, only include on training dataset (isTrainingObs <- TRUE) if YES
isTrainingObs <- TRUE
if(verifyConsecutive){
isConsecutive = TRUE
for(k in 0:(nDaysBefore-1)){
lag <- as.double(difftime(timestamps[i-k],timestamps[i-k-1]))
if(lag !=1) isConsecutive = FALSE ; break
}
if(!isConsecutive) isTrainingObs <- FALSE
}
#If it is trainingObs add to the training dataset
if(isTrainingObs){
#Get the hour column
hour_index = hour - 4 #Since first hour is 8am, then 8-4 = 4 ; 9-7 = 5..etc. 3 first columns = YEAR,MON,DAY
today.hour = data[i,hour_index]
#Get all 'nDaysBefore' vectors before today and concatenate them in a single vector
for(k in 0:(nDaysBefore - 1)){
train[j,(1+k*10):(10+k*10)] <- vector[i-(nDaysBefore-1-k)-1,4:13]
}
train[j,ncol(train)] <- today.hour
#yesterday.profile = as.numeric(data[i-1,4:ncol(data)]) #Dataframe row to vector
#train[j,] <- c(yesterday.profile,today.hour)
j<-j + 1
}
}
#colnames(train) <- c(paste0("h",seq(8,17)),"y")
return (train)
}
Create the 10 models training dataset
#Takes 15 minutes to get done or more. 70 Training inputs are created and 70 test datasets with the previous 1 to 7 days. Training data is curated to only contain consecutive days.
training.datasets <- vector("list",7)
test.datasets <- vector("list",7)
for(size in 1:7){
training.datasets[[size]] <- vector("list",10)
test.datasets[[size]] <- vector("list",10)
}
for(i in 1:10){
for(size in 1:7){
training.datasets[[size]][[i]] <- prep_train_next_day_hour(vector,i+7,size,TRUE)
test.datasets[[size]][[i]] <- prep_train_next_day_hour(test,i+7,size,FALSE)
}
}
#saveRDS(training.datasets,file="~/Desktop/7sizes_10hours_linear_training_C0875.rds")
#saveRDS(test.datasets,file="~/Desktop/7sizes_10hours_linear_test_C0875.rds")
#Loads 70 Linear Models Training data and their 70 respective test datasets.
#Training.datasets is a list of 7 elements, each of which contain a list of 10 dataframes.
#The list of 7 elements index the amount of days used for the forecast (e.g index 1 contain 10 linear models each for one hour between (inclusive) 8am to 5pm with 1 day before, index 2 contains 10 training input data for 8am to 10pm, but with 2 days, etc. )
training.datasets <- readRDS("~/Desktop/7sizes_10hours_linear_training_C0875.rds")
test.datasets <- readRDS("~/Desktop/7sizes_10hours_linear_test_C0875.rds")
#Train 70 models. There is one model for each hour (8 am to 5 pm) and for each hourly model we vary the amount of previous day from 1 to 7.
hourly.models <- vector("list",7)
predictions <- vector("list",7)
abs.errs <- vector("list",7)
for(size in 1:7){
hourly.models[[size]] <- vector("list",10)
predictions[[size]] <- vector("list",10)
abs.errs[[size]] <- vector("list",10)
}
for(i in 1:10){
for(size in 1:7){
y_index <- size*10+1 #always the rightmost element of the dataframe
expression <- as.formula(paste0("X",size*10+1,"~",paste(paste0("X",seq(1,size*10)),collapse="+") ))
hourly.models[[size]][[i]] <- lm(formula= expression,data=training.datasets[[size]][[i]])
predictions[[size]][[i]] <- predict(hourly.models[[size]][[i]],test.datasets[[size]][[i]][1:(y_index-1)]) #removes the ground truth, y on the last column and predicts
abs.errs[[size]][[i]] <- abs(predictions[[size]][[i]] - test.datasets[[size]][[i]][,y_index]) #y is always the most right position on any dataframe of any size
}
}
#Each element of the list is a dataframe. Each dataframe contains 10 columns (8 am to 5 pm) and each row is a day. As such, each element is the error of the forecast on a given day at a given hour.
n_previous_days_error <- vector("list",7)
mean_errors <- vector(mode="numeric",length=7)
for(size in 1:7){
n_previous_days_error[[size]] <- data.frame(abs.errs[size])
colnames(n_previous_days_error[[size]]) <- paste0("h",seq(8,17))
mean_errors[size] <- mean(rowSums(n_previous_days_error[[size]]))
}
#daily.err <- as.data.frame(abs.errs)
#colnames(daily.err) <- paste0("h",seq(8,17))
#hist(rowSums(daily.err),xlab="Daily Error",ylab="Frequency",main="Histogram of Daily Error")
labels <- c("B1","B2","B3","B4","B5","B6","B7")
qplot(labels,mean_errors, geom="bar",stat="identity",xlab="Model",ylab="Daily Mean Absolute Error",main="Per Next Day Hour Model Linear Regression") + science_theme
table <- matrix(mean_errors,ncol=7)
colnames(table) <-labels
kable(table)
| B1 | B2 | B3 | B4 | B5 | B6 | B7 |
|---|---|---|---|---|---|---|
| 1802.098 | 1848.303 | 1876.069 | 1890.084 | 1927.877 | 1937.804 | 2000.176 |