library(solarf,quietly = TRUE)
library(knitr,quietly = TRUE)
library(ggplot2,quietly = TRUE)
library(klaR,quietly= TRUE) #Naive Bayes Library
library(GGally)
library(ggplot2)
library(plyr)
library(zoo)
#Theme for Plot
science_theme = theme(panel.background=element_blank(), panel.grid.major = element_line(size = 0.5, color = "grey"), 
    axis.line = element_line(size = 0.7, color = "black"), legend.position = c(0.9, 
        0.7), text = element_text(size = 14))

#Load data
SCBH1 <- readRDS("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/SCBH1.rds")

Vectorize Step

Vectorize SCBH1 over the training and test partitions.

SCBH1$SOLR <- SCBH1$SOLR
SCBH1.vector <- station.vectorize(SCBH1,"SOLR")
SCBH1.vector <- SCBH1.vector[,c(1:3,12:21)]

Data Point Filtering

SCBH1.vector.filter <- station.vectorize(SCBH1[!is.na(SCBH1$SOLR),],"SOLR")
SCBH1.vector.filter <- SCBH1.vector.filter[complete.cases(SCBH1.vector.filter),c(1:3,12:21)]

Train and Test Split

#Separate the year of 2014 for prediction testing 
SCBH1.vector.filter.test <- SCBH1.vector.filter[SCBH1.vector.filter$YEAR==2014,]
SCBH1.vector.filter.train <- SCBH1.vector.filter[SCBH1.vector.filter$YEAR!=2014,]

#SCBH1.vector.filter.train <- SCBH1.vector.filter[SCBH1.vector.filter$MON==11 |
#                                                     SCBH1.vector.filter$MON==12 |
#                                                     SCBH1.vector.filter$MON==1 |
#                                                     SCBH1.vector.filter$MON==2 |
#                                                     SCBH1.vector.filter$MON==3 |
#                                                     SCBH1.vector.filter$MON==4
#                                                     ,]

Cluster Step

Cluster Train Dataset

Cluster with K-means = nclusters the training dataset and assign clusters to the daily profiles.

nclusters <- 5
set.seed(1234)
SCBH1.filter.train.clusters <- kmeans(SCBH1.vector.filter.train[,4:ncol(SCBH1.vector.filter)],nclusters)
SCBH1.vector.filter.train$clusterid <- SCBH1.filter.train.clusters[[1]]
SCBH1.vector.filter.train.centroids <- SCBH1.filter.train.clusters[[2]]

intensity.crescent.ordered.clusterid <- as.numeric(names(sort(rowSums(SCBH1.vector.filter.train.centroids))))
#Order the centroids in crescent order of ipntensity and reset rows
SCBH1.vector.filter.train.centroids <- SCBH1.vector.filter.train.centroids[intensity.crescent.ordered.clusterid,]
rownames(SCBH1.vector.filter.train.centroids) <- seq(1,nclusters)
#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
SCBH1.vector.filter.train$clusterid <- intensity.crescent.ordered.clusterid[SCBH1.vector.filter.train$clusterid]

Centroids

centroids_plot <- data.frame(SCBH1.vector.filter.train.centroids)
centroids_plot$color <-rownames(SCBH1.vector.filter.train.centroids)
colnames(centroids_plot) <- c(colnames(SCBH1.vector.filter.train.centroids),"Profile")
kable(SCBH1.vector.filter.train.centroids)
8 9 10 11 12 13 14 15 16 17
93.25782 143.7437 182.2846 205.3249 220.7571 234.0164 240.3055 239.8390 206.2414 113.76751
261.61291 416.2848 518.5012 533.2761 475.8101 406.1173 349.1187 262.7002 173.7578 70.17312
174.31396 304.4604 433.1721 561.7284 656.6293 688.9423 656.3423 555.5505 366.2306 156.37477
365.70480 581.6513 741.5203 808.6900 769.6827 679.7417 555.8782 407.4649 265.5332 109.94280
365.05443 583.3926 761.2091 898.6913 966.7140 958.8854 855.4969 679.2687 450.2462 193.17814
ggparcoord(
  data = centroids_plot,
  columns=c(1:ncol(SCBH1.vector.filter.train.centroids)),
  scale="globalminmax",
  alphaLines=0.2,
  groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
  ylab("Solar Radiation Itensity (W/m2") + ggtitle("Solar Radiation Discretized Profiles from 2003 to 2013 of Schofield Barracks Station") + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))

#+ scale_fill_discrete(name="Discretized\ Profile", breaks=c("1", "2", "3","4","5"), labels=c("1 (Overcast)", "2","3","4","5"))# + scale_colour_brewer(palette="Spectral")

Calc. Errors

Results.groundtruth <- station.assign.daily.cluster(SCBH1.vector.filter.test,SCBH1.vector.filter.train.centroids)

slide_window <- function(vector,size){
    df <- data.frame(rollapply(vector, width = size, by = 1, FUN = f1<-function(x){return (x)}, align = "left"))      
    return(df)
}


calc_daily_mean_error <- function(train.vector,test.vector,size){

    train <- slide_window(train.vector,size)
    colnames(train) <- c(paste0("Stm",seq(size-1,1)),"St")
    train[] <- lapply(train, as.factor)
    
    test <- slide_window(test.vector,size)
    colnames(test) <- c(paste0("Stm",seq(size-1,1)),"St")
    test[] <- lapply(test, as.factor)
    
    m <- NaiveBayes(St ~ ., data = train)
    forecast <- as.numeric(predict(m,data.frame(test[,1]))$class)
    
    daily.err <- error.absolute.centroid(Results.groundtruth[size:nrow(Results.groundtruth),4:13],forecast,SCBH1.vector.filter.train.centroids)
    mean.daily.err <- mean(daily.err)
    return (mean.daily.err)
}
#Amount of Consecutive Days. 2 days <=> forecast using one day before.
err.means <- vector("list",7)
for(i in 2:8){
    err.means[[i-1]] <- calc_daily_mean_error(SCBH1.vector.filter.train$clusterid,Results.groundtruth$clusterid,i)
}
df.err.means <- as.data.frame(err.means)
colnames(df.err.means) <- c("B1","B2","B3","B4","B5","B6","B7")

labels <- c("B1","B2","B3","B4","B5","B6","B7")
qplot(labels,as.numeric(df.err.means), geom="bar",stat="identity",xlab="Model",ylab="Daily Mean Absolute Error",main="Naive Bayes") + science_theme

kable(df.err.means)
B1 B2 B3 B4 B5 B6 B7
1612.885 1633.898 1640.278 1694.219 1696.3 1749.063 1675.328