library(solarf,quietly = TRUE)
library(knitr,quietly = TRUE)
library(ggplot2,quietly = TRUE)
#Theme for Plot
science_theme <- theme(panel.background = element_blank(),
    panel.grid.major = element_line(size = 0.5, color = "grey"),
    axis.line = element_line(size = 0.7, color = "black"),
    legend.position = c(0.9, 0.7),
    text = element_text(size = 14))

#Load data
#SCBH1 <- readRDS("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/SCBH1.rds")
C0875 <- read.csv("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/C0875_2011_2014.csv")

station.vectorize.hour <- function(station.dataframe, variable){
    library(reshape2)
    #When more than one observation exists for the same minute, keep only the first one
    use.only.first.occurrence <- function(minute.obs){
        return (minute.obs[1])
    }
    
    #Pivot to one row per (YEAR, MON, DAY, HR), with one column per minute of the hour
    station.dataframe.vectorized <- dcast(data=station.dataframe, formula=YEAR + MON + DAY + HR ~ MIN, value.var=variable, fun.aggregate=use.only.first.occurrence)
    return (station.dataframe.vectorized)
}
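A minimal sketch of what the reshaping produces, using a toy data frame whose column names match the MesoWest layout above (the values are invented purely for illustration): each (YEAR, MON, DAY, HR) group becomes one row, with one column per observed minute.

toy <- data.frame(YEAR = 2011, MON = 1, DAY = 1, HR = c(8, 8, 9),
                  MIN = c(0, 15, 0), SOLR = c(120, 135, 150))
station.vectorize.hour(toy, "SOLR")
#  YEAR MON DAY HR   0  15
#1 2011   1   1  8 120 135
#2 2011   1   1  9 150  NA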

C0875.vector.h <- station.vectorize.hour(C0875,"SOLR")
C0875.h <- C0875.vector.h[,c("YEAR","MON","DAY","HR")]
C0875.h$SOLR <- rowMeans(C0875.vector.h[,5:ncol(C0875.vector.h)],na.rm=TRUE) #Average all minute observations within the hour; the hourly value is therefore centered around 30 minutes past the hour. 
C0875.h$HR <- C0875.h$HR + 1 #Shift HR so hours run 1-24 instead of 0-23
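A quick sanity check on the hourly aggregation (a sketch; it only verifies that the dcast/rowMeans steps produced exactly one row per station hour):

stopifnot(anyDuplicated(C0875.h[,c("YEAR","MON","DAY","HR")]) == 0)
head(C0875.h)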

vector <- station.vectorize(C0875.h,"SOLR")

Training Dataset

Pre-Processing Routines

#vector <- station.vectorize(SCBH1,"SOLR")
#8am to 5pm
vector <- vector[,c(1:3,11:20)]
#remove days that have at least one missing value at any hour between 8am to 5pm
vector <- vector[complete.cases(vector),]
#2014 as test
test <- vector[vector$YEAR==2014,]
vector <- vector[vector$YEAR!=2014,]
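A hedged sanity check on the preprocessed matrices; the expected column count (3 date columns plus 10 hourly columns) follows directly from the 8am-5pm slice above, and the row counts are printed only for inspection.

stopifnot(ncol(vector) == 13, ncol(test) == 13)
nrow(vector) #complete training days (2011-2013)
nrow(test)   #complete test days (2014)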

#Prepare the input for the linear model: first verify that the time difference between consecutive days is one day, then extract the observations.
#TO-DO: Decouple the consecutive-days logic from the preparation of the training dataset
prep_train_next_day_hour <- function (data,hour,nDaysBefore,verifyConsecutive=TRUE){
    #Empty dataframe: nDaysBefore*10 predictor columns plus 1 target column (default column names X1, X2, ...)
    train <- data.frame(matrix(vector(), 0, nDaysBefore*10+1, dimnames=list(c(), c())), stringsAsFactors=F)
    #Convert timestamps for time arithmetic
    timestamps <- ISOdate(year = data[,"YEAR"], month = data[,"MON"], day = data[,"DAY"], tz="HST")
    j <- 1
    for (i in (nDaysBefore+1):nrow(data)){
        
        #If enabled, verify that the nDaysBefore previous days are all consecutive; only include the row in the training dataset (isTrainingObs <- TRUE) if they are
        isTrainingObs <- TRUE
        if(verifyConsecutive){ 
            isConsecutive <- TRUE
            for(k in 0:(nDaysBefore-1)){
                lag <- as.double(difftime(timestamps[i-k],timestamps[i-k-1])) 
                if(lag != 1){ isConsecutive <- FALSE ; break }
            }
            if(!isConsecutive) isTrainingObs <- FALSE
        }
        
        
        #If it is trainingObs add to the training dataset
        if(isTrainingObs){
            #Get the hour column
            hour_index <- hour - 4 #Since the first hour is 8am: 8-4 = 4, 9-4 = 5, etc. (the first 3 columns are YEAR, MON, DAY)
            today.hour <- data[i,hour_index]
            #Get all 'nDaysBefore' daily profiles before today and concatenate them into a single row
            for(k in 0:(nDaysBefore - 1)){
                train[j,(1+k*10):(10+k*10)] <- data[i-(nDaysBefore-1-k)-1,4:13]
            }
            }
            train[j,ncol(train)] <- today.hour            
            #yesterday.profile = as.numeric(data[i-1,4:ncol(data)]) #Dataframe row to vector
            #train[j,] <-  c(yesterday.profile,today.hour)
            j<-j + 1  
        }
    }
    #colnames(train) <- c(paste0("h",seq(8,17)),"y")
    return (train)
}
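As a usage sketch (train.12.2 is just an illustrative name), the training matrix for a given target hour and window size should have nDaysBefore*10 predictor columns plus the target in the last column; the default names X1, X2, ... come from the empty dataframe construction inside the function.

train.12.2 <- prep_train_next_day_hour(vector, hour = 12, nDaysBefore = 2)
stopifnot(ncol(train.12.2) == 2*10 + 1) #X1..X20 are the two previous daily profiles, X21 is y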

Create the Training Datasets for the Hourly Models

#Takes 15 minutes or more. Creates 70 training datasets and 70 test datasets (10 hourly targets x 1 to 7 previous days). Training data is curated to contain only consecutive days. 

training.datasets <- vector("list",7)
test.datasets <- vector("list",7)

for(size in 1:7){
    training.datasets[[size]] <- vector("list",10)    
    test.datasets[[size]] <- vector("list",10)
}

for(i in 1:10){
    for(size in 1:7){
        training.datasets[[size]][[i]] <- prep_train_next_day_hour(vector,i+7,size,TRUE) #target hour = i+7 (i=1 -> 8am, i=10 -> 5pm)
        test.datasets[[size]][[i]] <- prep_train_next_day_hour(test,i+7,size,FALSE)
    }
    
}

#saveRDS(training.datasets,file="~/Desktop/7sizes_10hours_linear_training_C0875.rds")
#saveRDS(test.datasets,file="~/Desktop/7sizes_10hours_linear_test_C0875.rds")
#Loads the 70 linear-model training datasets and their 70 respective test datasets. 
#training.datasets is a list of 7 elements, each of which contains a list of 10 dataframes. 
#The outer index is the number of previous days used for the forecast (e.g. index 1 contains 10 training datasets, one per hour between 8am and 5pm inclusive, using 1 previous day; index 2 contains the same 10 hours but with 2 previous days, etc.)
training.datasets <- readRDS("~/Desktop/7sizes_10hours_linear_training_C0875.rds")
test.datasets <- readRDS("~/Desktop/7sizes_10hours_linear_test_C0875.rds")
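To make the nested indexing concrete: the outer index is the number of previous days (1 to 7) and the inner index maps to the target hour via hour = index + 7, so index 1 is 8am and index 10 is 5pm. For example:

dim(training.datasets[[2]][[3]]) #10am model (3 + 7 = 10) with 2 previous days; expect 21 columns (X1..X20 predictors, X21 target)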

Hourly Linear Forecast

Training, Predictions and Absolute Errors

#Train 70 models: one model for each hour (8am to 5pm), and for each hourly model the number of previous days used as predictors varies from 1 to 7. 
hourly.models <- vector("list",7)
predictions <- vector("list",7)
abs.errs <- vector("list",7)
for(size in 1:7){
    hourly.models[[size]] <- vector("list",10)
    predictions[[size]] <- vector("list",10)
    abs.errs[[size]] <- vector("list",10)
}

for(i in 1:10){
    for(size in 1:7){
        y_index <- size*10+1 #the target y is always the rightmost column of the dataframe 
        expression <- as.formula(paste0("X",size*10+1,"~",paste(paste0("X",seq(1,size*10)),collapse="+") ))        
        hourly.models[[size]][[i]] <- lm(formula= expression,data=training.datasets[[size]][[i]])
        predictions[[size]][[i]] <- predict(hourly.models[[size]][[i]],test.datasets[[size]][[i]][1:(y_index-1)]) #drop the ground truth y (last column) and predict
        abs.errs[[size]][[i]] <- abs(predictions[[size]][[i]] - test.datasets[[size]][[i]][,y_index]) #y is always the rightmost column regardless of window size  
    }
}
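A quick way to inspect the fitted models and their errors before aggregating them into daily totals (a sketch; the chosen indices are arbitrary examples):

sapply(abs.errs[[1]], mean)      #per-hour mean absolute error for the 1-previous-day models
summary(hourly.models[[1]][[5]]) #coefficients of the noon model (5 + 7 = 12) trained with 1 previous day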

Daily Error Vector

#Each element of the list is a dataframe with 10 columns (8am to 5pm), where each row is a day. Each cell is therefore the absolute error of the forecast for a given day at a given hour. 
n_previous_days_error <- vector("list",7)
mean_errors <- vector(mode="numeric",length=7)
for(size in 1:7){
    n_previous_days_error[[size]] <- data.frame(abs.errs[[size]])
    colnames(n_previous_days_error[[size]]) <- paste0("h",seq(8,17))
    mean_errors[size] <- mean(rowSums(n_previous_days_error[[size]]))
}

#daily.err <- as.data.frame(abs.errs)
#colnames(daily.err) <- paste0("h",seq(8,17))
#hist(rowSums(daily.err),xlab="Daily Error",ylab="Frequency",main="Histogram of Daily Error")
labels <- c("B1","B2","B3","B4","B5","B6","B7")
qplot(labels,mean_errors, geom="col",xlab="Model",ylab="Daily Mean Absolute Error",main="Per Next Day Hour Model Linear Regression") + science_theme

table <- matrix(mean_errors,ncol=7)
colnames(table) <-labels
kable(table)
|       B1|       B2|       B3|       B4|       B5|       B6|       B7|
|--------:|--------:|--------:|--------:|--------:|--------:|--------:|
| 1802.098| 1848.303| 1876.069| 1890.084| 1927.877| 1937.804| 2000.176|