library(solarf,quietly = TRUE)
library(knitr,quietly = TRUE)
library(ggplot2,quietly = TRUE)
library(klaR,quietly= TRUE) #Naive Bayes Library
library(GGally)
library(ggplot2)
library(plyr)
library(zoo)
#Theme for Plot
science_theme = theme(panel.background=element_blank(), panel.grid.major = element_line(size = 0.5, color = "grey"), 
    axis.line = element_line(size = 0.7, color = "black"), legend.position = c(0.9, 
        0.7), text = element_text(size = 14))

#Load data
#C0875 <- readRDS("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/C0875.rds")
C0875 <- read.csv("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/C0875_2011_2014.csv")

station.vectorize.hour <- function(station.dataframe,variable){
    library(reshape2)
    use.only.last.ocurrence <- function(minute.obs){
        return (minute.obs[1])
    }
    
    station.dataframe.vectorized <- dcast(data=station.dataframe, formula=YEAR + MON + DAY + HR ~ MIN,value.var = variable,fun.aggregate=use.only.last.ocurrence)
    return (station.dataframe.vectorized)
}

C0875.vector.h <- station.vectorize.hour(C0875,"SOLR")
C0875.h <- C0875.vector.h[,c("YEAR","MON","DAY","HR")]
C0875.h$SOLR <- rowMeans(C0875.vector.h[,5:ncol(C0875.vector.h)],na.rm=TRUE) #Calculate the mean for the hour from min 0 to min 60 ; baseline is hence hour and 30 mins. 
C0875.h$HR <- C0875.h$HR + 1

C0875.vector <- station.vectorize(C0875.h,"SOLR")

Vectorize Step

Vectorize C0875 over the training and test partitions.

#C0875$SOLR <- C0875$SOLR
#C0875.vector <- station.vectorize(C0875,"SOLR")
C0875.vector <- C0875.vector[,c(1:3,11:20)]

Data Point Filtering

#C0875.vector.filter <- station.vectorize(C0875[!is.na(C0875$SOLR),],"SOLR")
#C0875.vector.filter <- C0875.vector.filter[complete.cases(C0875.vector.filter),c(1:3,12:21)]
C0875.vector.filter <- C0875.vector[complete.cases(C0875.vector),]

Train and Test Split

#Separate the year of 2014 for prediction testing 
C0875.vector.filter.test <- C0875.vector.filter[C0875.vector.filter$YEAR==2014,]
C0875.vector.filter.train <- C0875.vector.filter[C0875.vector.filter$YEAR!=2014,]

#C0875.vector.filter.train <- C0875.vector.filter[C0875.vector.filter$MON==11 |
#                                                     C0875.vector.filter$MON==12 |
#                                                     C0875.vector.filter$MON==1 |
#                                                     C0875.vector.filter$MON==2 |
#                                                     C0875.vector.filter$MON==3 |
#                                                     C0875.vector.filter$MON==4
#                                                     ,]

Cluster Step

Cluster Train Dataset

Cluster with K-means = nclusters the training dataset and assign clusters to the daily profiles.

nclusters <- 5
set.seed(1234)
C0875.filter.train.clusters <- kmeans(C0875.vector.filter.train[,4:ncol(C0875.vector.filter)],nclusters)
C0875.vector.filter.train$clusterid <- C0875.filter.train.clusters[[1]]
C0875.vector.filter.train.centroids <- C0875.filter.train.clusters[[2]]

intensity.crescent.ordered.clusterid <- as.numeric(names(sort(rowSums(C0875.vector.filter.train.centroids))))
#Order the centroids in crescent order of ipntensity and reset rows
C0875.vector.filter.train.centroids <- C0875.vector.filter.train.centroids[intensity.crescent.ordered.clusterid,]
rownames(C0875.vector.filter.train.centroids) <- seq(1,nclusters)
#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
C0875.vector.filter.train$clusterid <- intensity.crescent.ordered.clusterid[C0875.vector.filter.train$clusterid]

Centroids

centroids_plot <- data.frame(C0875.vector.filter.train.centroids)
centroids_plot$color <-rownames(C0875.vector.filter.train.centroids)
colnames(centroids_plot) <- c(colnames(C0875.vector.filter.train.centroids),"Profile")
kable(C0875.vector.filter.train.centroids)
8 9 10 11 12 13 14 15 16 17
63.75098 169.7902 273.9124 292.8852 312.6332 334.7674 341.3711 292.0897 236.2965 144.7556
85.89587 248.3484 427.8448 571.5542 629.9225 636.4579 573.0717 473.9575 343.4200 200.5349
95.86391 251.6029 441.4603 599.4685 742.0340 820.2648 836.0977 750.7302 556.1624 328.4749
196.82671 388.6677 599.1102 755.9045 814.2504 825.1088 758.6555 605.5889 455.8949 266.5185
207.56872 417.2371 629.2724 825.2460 954.5118 981.7954 940.3175 863.0599 680.0826 452.6717
ggparcoord(
  data = centroids_plot,
  columns=c(1:ncol(C0875.vector.filter.train.centroids)),
  scale="globalminmax",
  alphaLines=0.2,
  groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
  ylab("Solar Radiation Itensity (W/m2") + ggtitle("Solar Radiation Discretized Profiles from 2003 to 2013 of Schofield Barracks Station") + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))

#+ scale_fill_discrete(name="Discretized\ Profile", breaks=c("1", "2", "3","4","5"), labels=c("1 (Overcast)", "2","3","4","5"))# + scale_colour_brewer(palette="Spectral")

Calc. Errors

Results.groundtruth <- station.assign.daily.cluster(C0875.vector.filter.test,C0875.vector.filter.train.centroids)

slide_window <- function(vector,size){
    df <- data.frame(rollapply(vector, width = size, by = 1, FUN = f1<-function(x){return (x)}, align = "left"))      
    return(df[complete.cases(df),])
}


calc_daily_mean_error <- function(train.vector,test.vector,size){

    train <- slide_window(train.vector,size)
    colnames(train) <- c(paste0("Stm",seq(size-1,1)),"St")
    train[] <- lapply(train, as.factor)
    
    test <- slide_window(test.vector,size)
    colnames(test) <- c(paste0("Stm",seq(size-1,1)),"St")
    test[] <- lapply(test, as.factor)
    
    m <- NaiveBayes(St ~ ., data = train)
    forecast <- as.numeric(predict(m,data.frame(test[,1]))$class)
    
    daily.err <- error.absolute.centroid(Results.groundtruth[size:nrow(Results.groundtruth),4:13],forecast,C0875.vector.filter.train.centroids)
    mean.daily.err <- mean(daily.err)
    return (mean.daily.err)
}

Add rows on timestamps that do not occur on the data, and fill them with NAs. This is used so the slide window can verify if the days are consecutive.

C0875.vector.filter.train <- station.add.missing.profiles(C0875.vector.filter.train)
#Amount of Consecutive Days. 2 days <=> forecast using one day before.
err.means <- vector("list",7)
for(i in 2:8){
    err.means[[i-1]] <- calc_daily_mean_error(C0875.vector.filter.train$clusterid,Results.groundtruth$clusterid,i)
}
df.err.means <- as.data.frame(err.means)
colnames(df.err.means) <- c("B1","B2","B3","B4","B5","B6","B7")

labels <- c("B1","B2","B3","B4","B5","B6","B7")
qplot(labels,as.numeric(df.err.means), geom="bar",stat="identity",xlab="Model",ylab="Daily Mean Absolute Error",main="Naive Bayes") + science_theme

kable(df.err.means)
B1 B2 B3 B4 B5 B6 B7
1337.365 1366.718 1405.523 1427.534 1438.938 1444.905 1513.674