Libraries and Data Pre-Processing

#Load Required Libraries
library(solarf)
library(knitr) #For pretty tables on the report using kable 
library(plyr)
library(ggplot2)
library(GGally)
library(caret)
library(extrafont)

#Theme for Plot
science_theme = theme(
    panel.background=element_blank(), 
    #panel.grid.major = element_line(size = 0.5, color = "grey"),
    #axis.line = element_line(size = 0.7, color = "black"), 
    #legend.position = c(0.9, 0.7), 
    #text = element_text(size = 14),
    axis.line=element_blank(),
    text = element_text(size=15, family="Interstate"),
    axis.text.x = element_text(hjust = 0, size=15,color="black"), 
    axis.text.y = element_text(hjust = 0, size=15,color="black"), 
    axis.title.y=element_text(size=15), 
    axis.title.x=element_text(size=15),
    axis.ticks = element_line(colour=NA),
    panel.grid.major = element_line(colour = 'gray', linetype = 'dashed'),
    panel.grid.minor = element_line(colour = NA),  
    panel.background = element_rect(colour = 'white'))

Load Station C0875 (Southeast Oahu Island)

C0875 <- read.csv("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/C0875_2011_2014.csv")

station.vectorize.hour <- function(station.dataframe,variable){
    library(reshape2)
    use.only.last.ocurrence <- function(minute.obs){
        return (minute.obs[1])
    }
    
    station.dataframe.vectorized <- dcast(data=station.dataframe, formula=YEAR + MON + DAY + HR ~ MIN,value.var = variable,fun.aggregate=use.only.last.ocurrence)
    return (station.dataframe.vectorized)
}

Vectorize Step

Vectorize C0875 over the training and test partitions.

#Moves from minute level to hourly level
C0875.vector.h <- station.vectorize.hour(C0875,"SOLR")
C0875.h <- C0875.vector.h[,c("YEAR","MON","DAY","HR")]
C0875.h$MIN <- 0
C0875.h$HR <- C0875.h$HR + 1
C0875.h$SOLR <- rowMeans(C0875.vector.h[,5:ncol(C0875.vector.h)],na.rm=TRUE) #Calculate the mean for the hour from min 0 to min 60 ; baseline is hence hour and 30 mins. 

C0875.h <- readRDS("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/SCBH1.rds")
#C0875.h <- C0875.h[C0875.h$YEAR > 2011,]

C0875.vector <- station.vectorize(C0875.h,"SOLR")
C0875.vector <- C0875.vector[,c(1:3,12:21)] #SOLR starts on Feb 15, 2012. Change to 12:21 for schb1

Data Point Filtering

#C0875.vector.filter <- station.vectorize(C0875[!is.na(C0875$SOLR),],"SOLR")
C0875.vector.filter <- C0875.vector[complete.cases(C0875.vector),]

Train and Test Split

#Separate the year of 2014 for prediction testing 
C0875.vector.filter.test <- C0875.vector.filter[C0875.vector.filter$YEAR==2014,]
C0875.vector.filter.train <- C0875.vector.filter[C0875.vector.filter$YEAR!=2014,]

#C0875.vector.filter.train <- C0875.vector.filter[C0875.vector.filter$MON==11 |
#                                                     C0875.vector.filter$MON==12 |
#                                                     C0875.vector.filter$MON==1 |
#                                                     C0875.vector.filter$MON==2 |
#                                                     C0875.vector.filter$MON==3 |
#                                                     C0875.vector.filter$MON==4
#                                                     ,]

Cluster Step

Cluster Train Dataset

Cluster with K-means = nclusters the training dataset and assign clusters to the daily profiles.

nclusters <- 5
set.seed(1234)
C0875.filter.train.clusters <- kmeans(C0875.vector.filter.train[,4:ncol(C0875.vector.filter)],nclusters)
C0875.vector.filter.train$clusterid <- C0875.filter.train.clusters[[1]]
C0875.vector.filter.train.centroids <- C0875.filter.train.clusters[[2]]

intensity.crescent.ordered.clusterid <- as.numeric(names(sort(rowSums(C0875.vector.filter.train.centroids))))
#Order the centroids in crescent order of ipntensity and reset rows
C0875.vector.filter.train.centroids <- C0875.vector.filter.train.centroids[intensity.crescent.ordered.clusterid,]
rownames(C0875.vector.filter.train.centroids) <- seq(1,nclusters)
#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
C0875.vector.filter.train$clusterid <- intensity.crescent.ordered.clusterid[C0875.vector.filter.train$clusterid]

Centroids

centroids_plot <- data.frame(C0875.vector.filter.train.centroids)
centroids_plot$color <-rownames(C0875.vector.filter.train.centroids)
colnames(centroids_plot) <- c(colnames(C0875.vector.filter.train.centroids),"Profile")
kable(C0875.vector.filter.train.centroids)
8 9 10 11 12 13 14 15 16 17
96.45584 145.7208 184.0028 208.9416 225.6738 235.2094 239.6068 238.3433 203.0014 111.5228
275.89212 431.7226 542.3784 549.3476 472.7911 394.2021 338.9863 254.0959 164.6301 67.0274
171.47927 302.4083 428.9764 542.8919 646.5695 679.3775 633.5432 530.5714 347.3910 149.9687
362.70761 583.1298 739.1073 815.1869 784.6263 689.3010 566.5087 413.2837 266.8270 109.6263
355.95481 572.0663 756.5471 896.4277 969.5854 957.8625 856.8865 690.5565 459.9856 198.4452

Centroids

ggparcoord(
  data = centroids_plot,
  columns=c(1:ncol(C0875.vector.filter.train.centroids)),
  scale="globalminmax",
  alphaLines=0.2,
  groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
  ylab("Solar Irradiance (W/m^2)") + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))

#+ scale_fill_discrete(name="Discretized\ Profile", breaks=c("1", "2", "3","4","5"), labels=c("1 (Overcast)", "2","3","4","5"))# + scale_colour_brewer(palette="Spectral")

Cluster Test Dataset (Ground Truth)

Results.groundtruth <- station.assign.daily.cluster(C0875.vector.filter.test,C0875.vector.filter.train.centroids)

Base Models Creation

Model 1 (J=2)

jointsize <- 2 

Create Argmax Model

#Argmax model creation
C0875.vector.filter.train <- station.add.missing.profiles(C0875.vector.filter.train)
## Warning: package 'zoo' was built under R version 3.1.3
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
C0875.filter.train.joint <- station.joint.probability(C0875.vector.filter.train$clusterid,jointsize)
C0875.argmax.model <- model.argmax(C0875.filter.train.joint)
## hash-2.2.6 provided by Decision Patterns

Perform Prediction

#Prediction using the model
C0875.vector.filter.pred <-station.predict.daily.cluster(station.vector.clustered=Results.groundtruth,model=C0875.argmax.model)

All probabilities for each Prediction

#All probabilities using the model
C0875.filter.train.conditional <- station.conditional.probability(C0875.filter.train.joint)
C0875.vector.filter.All.Probs <- station.clusterid.probability(station.conditional.probability=C0875.filter.train.conditional,clusterid.vector = Results.groundtruth$clusterid)

M1 Outcome Dataframe (Prediction|Support|AllProbs)

Results.m1<-data.frame(
  Pred = C0875.vector.filter.pred$Daym0,
  Support = C0875.vector.filter.pred$Freq,
  All.Probs = C0875.vector.filter.All.Probs,
  Hist = C0875.vector.filter.pred$Hist
)
kable(head(Results.m1))
Pred Support All.Probs Hist
NA NA NA NA
1 482 0.462655601659751 0.0829875518672199 0.0186721991701245 0.282157676348548 0.153526970954357 1
1 482 0.462655601659751 0.0829875518672199 0.0186721991701245 0.282157676348548 0.153526970954357 1
1 482 0.462655601659751 0.0829875518672199 0.0186721991701245 0.282157676348548 0.153526970954357 1
1 482 0.462655601659751 0.0829875518672199 0.0186721991701245 0.282157676348548 0.153526970954357 1
1 482 0.462655601659751 0.0829875518672199 0.0186721991701245 0.282157676348548 0.153526970954357 1

Model 2 (J=3)

jointsize <- 3 

Create Argmax Model

#Argmax model creation
C0875.filter.train.joint <- station.joint.probability(C0875.vector.filter.train$clusterid,jointsize)
C0875.argmax.model <- model.argmax(C0875.filter.train.joint)

Perform Prediction

#Prediction using the model
C0875.vector.filter.pred <-station.predict.daily.cluster(station.vector.clustered=Results.groundtruth,model=C0875.argmax.model)

All probabilities for each Prediction

#All probabilities using the model
C0875.filter.train.conditional <- station.conditional.probability(C0875.filter.train.joint)
C0875.vector.filter.All.Probs <- station.clusterid.probability(station.conditional.probability=C0875.filter.train.conditional,clusterid.vector = Results.groundtruth$clusterid)

M2 Outcome Dataframe (Prediction|Support|AllProbs)

Results.m2<-data.frame(
  Pred = C0875.vector.filter.pred$Daym0,
  Support = C0875.vector.filter.pred$Freq,
  All.Probs = C0875.vector.filter.All.Probs,
  Hist = C0875.vector.filter.pred$Hist
)
kable(head(Results.m2))
Pred Support All.Probs Hist
NA NA NA NA
NA NA NA NA
1 207 0.507246376811594 0.072463768115942 0.00966183574879227 0.280193236714976 0.130434782608696 1 1
1 207 0.507246376811594 0.072463768115942 0.00966183574879227 0.280193236714976 0.130434782608696 1 1
1 207 0.507246376811594 0.072463768115942 0.00966183574879227 0.280193236714976 0.130434782608696 1 1
1 207 0.507246376811594 0.072463768115942 0.00966183574879227 0.280193236714976 0.130434782608696 1 1

Model 3 (J=4)

kable(head(Results.m3))
Pred Support All.Probs Hist
NA NA NA NA
NA NA NA NA
NA NA NA NA
1 101 0.554455445544554 0.0693069306930693 0.0099009900990099 0.267326732673267 0.099009900990099 1 1 1
1 101 0.554455445544554 0.0693069306930693 0.0099009900990099 0.267326732673267 0.099009900990099 1 1 1
1 101 0.554455445544554 0.0693069306930693 0.0099009900990099 0.267326732673267 0.099009900990099 1 1 1

Model 4 (J=5)

kable(head(Results.m4))
Pred Support All.Probs Hist
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
1 53 0.566037735849057 0.0943396226415094 0.283018867924528 0.0566037735849057 1 1 1 1
1 53 0.566037735849057 0.0943396226415094 0.283018867924528 0.0566037735849057 1 1 1 1

Model 5 (J=6)

kable(head(Results.m5))
Pred Support All.Probs Hist
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
1 29 0.655172413793103 0.137931034482759 0.172413793103448 0.0344827586206897 1 1 1 1 1

Model 6 (J=7)

kable(head(Results.m6))
Pred Support All.Probs Hist
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA

Model 7 (J=8)

kable(head(Results.m7))
Pred Support All.Probs Hist
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA

Entropy Arbtitrer

Minimum entropy selection criteria

Results.decision <- min.entropy.arbitrer(data.frame(
  Results.m1$All.Probs,
  Results.m2$All.Probs,
  Results.m3$All.Probs,  
  Results.m4$All.Probs,  
  Results.m5$All.Probs,
  Results.m6$All.Probs,
  Results.m7$All.Probs  
  ),
  data.frame(
  Results.m1$Support,
  Results.m2$Support,
  Results.m3$Support,  
  Results.m4$Support,  
  Results.m5$Support,
  Results.m6$Support,
  Results.m7$Support
  )
)
Results.decision.no.support <- min.entropy.arbitrer(data.frame(
  Results.m1$All.Probs,
  Results.m2$All.Probs,
  Results.m3$All.Probs,  
  Results.m4$All.Probs,  
  Results.m5$All.Probs,
  Results.m6$All.Probs,
  Results.m7$All.Probs  
  ))

Error Calculation

Ensemble Prediction

Results <- data.frame(
  groundtruth = Results.groundtruth,
  m1 = Results.m1,
  m2 = Results.m2,
  m3 = Results.m3,  
  m4 = Results.m4,  
  m5 = Results.m5,
  m6 = Results.m6,  
  m7 = Results.m7,    
  decision = Results.decision, #Returns Inf if not a single entropy is available due to lack of hist. 
  decision.nosupport = Results.decision.no.support 
)

Output the final predictions using the arbitrer WITH support

Ensemble.Prediction <- data.frame(
  m1.pred = Results.m1$Pred,
  m2.pred = Results.m2$Pred,
  m3.pred = Results.m3$Pred,  
  m4.pred = Results.m4$Pred,  
  m5.pred = Results.m5$Pred,
  m6.pred = Results.m6$Pred,
  m7.pred = Results.m7$Pred,  
  decision = Results.decision$Index
)
Results$arbitrerPred <- Ensemble.Prediction[cbind(1:nrow(Ensemble.Prediction), Ensemble.Prediction$decision)]

Output the final predictions using the arbitrer WITHOUT support

Ensemble.Prediction.no.support <- data.frame(
  m1.pred = Results.m1$Pred,
  m2.pred = Results.m2$Pred,
  m3.pred = Results.m3$Pred,  
  m4.pred = Results.m4$Pred,  
  m5.pred = Results.m5$Pred,
  m6.pred = Results.m6$Pred,
  m7.pred = Results.m7$Pred,  
  decision = Results.decision.no.support$Index
)
Results$arbitrerPredNoSupport <- Ensemble.Prediction.no.support[cbind(1:nrow(Ensemble.Prediction.no.support), Ensemble.Prediction.no.support$decision)]

NA to Most Frequent Centroid

a <- summary(as.factor(C0875.vector.filter.train$clusterid[!is.na(C0875.vector.filter.train$clusterid)]))
base.centroid <- which(a == max(a))
Ensemble.Prediction$m1.pred[is.na(Ensemble.Prediction$m1.pred)] <- base.centroid
Ensemble.Prediction$m2.pred[is.na(Ensemble.Prediction$m2.pred)] <- base.centroid
Ensemble.Prediction$m3.pred[is.na(Ensemble.Prediction$m3.pred)] <- base.centroid
Ensemble.Prediction$m4.pred[is.na(Ensemble.Prediction$m4.pred)] <- base.centroid
Ensemble.Prediction$m5.pred[is.na(Ensemble.Prediction$m5.pred)] <- base.centroid
Ensemble.Prediction$m6.pred[is.na(Ensemble.Prediction$m6.pred)] <- base.centroid
Ensemble.Prediction$m7.pred[is.na(Ensemble.Prediction$m7.pred)] <- base.centroid
Results$arbitrerPred[is.na(Results$arbitrerPred)] <- base.centroid
Results$arbitrerPredNoSupport[is.na(Results$arbitrerPredNoSupport)] <- base.centroid

Individual Errors

Base Model 1 Error

m1.error <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Ensemble.Prediction$m1.pred,cluster.centroids=C0875.vector.filter.train.centroids)

hist(m1.error)