Libraries and Data Pre-Processing

#Load Required Libraries
library(solarf)
library(knitr) #For pretty tables in the report using kable()
library(plyr)
library(ggplot2)
library(GGally)
library(caret)

#Theme for Plot
science_theme <- theme(panel.background = element_blank(),
    panel.grid.major = element_line(size = 0.5, color = "grey"),
    axis.line = element_line(size = 0.7, color = "black"),
    legend.position = c(0.9, 0.7),
    text = element_text(size = 14))

Load Station C0875 (Southeast Oahu Island)

C0875 <- read.csv("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/C0875_2011_2014.csv")

station.vectorize.hour <- function(station.dataframe,variable){
    library(reshape2)
    #Keep only the first observation recorded for each minute
    use.only.first.occurrence <- function(minute.obs){
        return (minute.obs[1])
    }
    
    #Pivot from one row per minute to one row per hour, with one column per minute
    station.dataframe.vectorized <- dcast(data=station.dataframe, formula=YEAR + MON + DAY + HR ~ MIN, value.var=variable, fun.aggregate=use.only.first.occurrence)
    return (station.dataframe.vectorized)
}
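
For illustration only, a toy example (hypothetical data, not part of the station file) of what station.vectorize.hour() returns: one row per hour, one column per minute observed in that hour.

#Hypothetical toy input: two SOLR readings within the same hour
toy <- data.frame(YEAR=2012, MON=2, DAY=15, HR=8, MIN=c(0,20), SOLR=c(110,130))
station.vectorize.hour(toy,"SOLR")
#Returns a single row (2012-02-15, hour 8) with columns YEAR, MON, DAY, HR, "0", "20"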

Vectorize Step

Vectorize C0875 from minute-level observations to hourly means and then to daily hour-by-hour profiles; the training and test partitions are split afterwards.

#Moves from minute level to hourly level
C0875.vector.h <- station.vectorize.hour(C0875,"SOLR")
C0875.h <- C0875.vector.h[,c("YEAR","MON","DAY","HR")]
C0875.h$SOLR <- rowMeans(C0875.vector.h[,5:ncol(C0875.vector.h)],na.rm=TRUE) #Mean over all minute observations within the hour (minutes 0-59); the hourly value therefore represents the half-hour mark.
C0875.h$HR <- C0875.h$HR + 1 #Shift hour labels from 0-23 to 1-24

C0875.vector <- station.vectorize(C0875.h,"SOLR")
C0875.vector <- C0875.vector[C0875.vector$YEAR>2011,c(1:3,11:20)] #SOLR starts on Feb 15, 2012. 
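
station.vectorize() is provided by the solarf package and its source is not shown here. Assuming it mirrors station.vectorize.hour() but pivots hours into columns (one row per day), an equivalent sketch would be:

#Sketch only (assumption about solarf::station.vectorize, not the package source):
#one row per day, one column per hour, values are the hourly SOLR means
station.vectorize.day.sketch <- function(station.dataframe, variable){
    library(reshape2)
    dcast(data=station.dataframe, formula=YEAR + MON + DAY ~ HR, value.var=variable)
}

With that layout, columns 11:20 correspond to hours 8 through 17, which is why the centroid table below is indexed by those hours.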

Data Point Filtering

#C0875.vector.filter <- station.vectorize(C0875[!is.na(C0875$SOLR),],"SOLR")
C0875.vector.filter <- C0875.vector[complete.cases(C0875.vector),]

Train and Test Split

#Separate the year of 2014 for prediction testing 
C0875.vector.filter.test <- C0875.vector.filter[C0875.vector.filter$YEAR==2014,]
C0875.vector.filter.train <- C0875.vector.filter[C0875.vector.filter$YEAR!=2014,]

#C0875.vector.filter.train <- C0875.vector.filter[C0875.vector.filter$MON==11 |
#                                                     C0875.vector.filter$MON==12 |
#                                                     C0875.vector.filter$MON==1 |
#                                                     C0875.vector.filter$MON==2 |
#                                                     C0875.vector.filter$MON==3 |
#                                                     C0875.vector.filter$MON==4
#                                                     ,]

Cluster Step

Cluster Train Dataset

Cluster the training dataset with k-means (k = nclusters) and assign each daily profile to a cluster.

nclusters <- 5
set.seed(1234)
C0875.filter.train.clusters <- kmeans(C0875.vector.filter.train[,4:ncol(C0875.vector.filter.train)],nclusters)
C0875.vector.filter.train$clusterid <- C0875.filter.train.clusters$cluster #Cluster assignment for each training day
C0875.vector.filter.train.centroids <- C0875.filter.train.clusters$centers #Centroid profile for each cluster

intensity.crescent.ordered.clusterid <- as.numeric(names(sort(rowSums(C0875.vector.filter.train.centroids))))
#Order the centroids in ascending order of intensity and reset the row names
C0875.vector.filter.train.centroids <- C0875.vector.filter.train.centroids[intensity.crescent.ordered.clusterid,]
rownames(C0875.vector.filter.train.centroids) <- seq(1,nclusters)
#Update the cluster ids by mapping each id before ordering to its id after ordering
C0875.vector.filter.train$clusterid <- intensity.crescent.ordered.clusterid[C0875.vector.filter.train$clusterid]

Centroids

centroids_plot <- data.frame(C0875.vector.filter.train.centroids)
centroids_plot$color <-rownames(C0875.vector.filter.train.centroids)
colnames(centroids_plot) <- c(colnames(C0875.vector.filter.train.centroids),"Profile")
kable(C0875.vector.filter.train.centroids)
|   | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
|---|---|---|----|----|----|----|----|----|----|----|
| 1 | 63.75098 | 169.7902 | 273.9124 | 292.8852 | 312.6332 | 334.7674 | 341.3711 | 292.0897 | 236.2965 | 144.7556 |
| 2 | 85.89587 | 248.3484 | 427.8448 | 571.5542 | 629.9225 | 636.4579 | 573.0717 | 473.9575 | 343.4200 | 200.5349 |
| 3 | 95.86391 | 251.6029 | 441.4603 | 599.4685 | 742.0340 | 820.2648 | 836.0977 | 750.7302 | 556.1624 | 328.4749 |
| 4 | 196.82671 | 388.6677 | 599.1102 | 755.9045 | 814.2504 | 825.1088 | 758.6555 | 605.5889 | 455.8949 | 266.5185 |
| 5 | 207.56872 | 417.2371 | 629.2724 | 825.2460 | 954.5118 | 981.7954 | 940.3175 | 863.0599 | 680.0826 | 452.6717 |

Centroids Plot

ggparcoord(
  data = centroids_plot,
  columns=c(1:ncol(C0875.vector.filter.train.centroids)),
  scale="globalminmax",
  alphaLines=0.2,
  groupColumn=ncol(centroids_plot)
) + geom_line(size=2, alpha=0.6) + science_theme +
  xlab("Hour of the Day") +
  ylab("Solar Radiation Intensity (W/m2)") +
  ggtitle("Solar Radiation Discretized Profiles from 2012 to 2013 of Station C0875 (Southeast Oahu)") +
  theme(axis.text.x = element_text(hjust = 1, size=20, color="black"),
        axis.text.y = element_text(hjust = 1, size=20, color="black"),
        plot.title = element_text(size=20),
        axis.title.y = element_text(size=20),
        axis.title.x = element_text(size=20))

#+ scale_fill_discrete(name="Discretized\ Profile", breaks=c("1", "2", "3","4","5"), labels=c("1 (Overcast)", "2","3","4","5"))# + scale_colour_brewer(palette="Spectral")

Cluster Test Dataset (Ground Truth)

Results.groundtruth <- station.assign.daily.cluster(C0875.vector.filter.test,C0875.vector.filter.train.centroids)
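
station.assign.daily.cluster() comes from solarf; the sketch below shows the underlying idea under the assumption that each test day is assigned to its nearest training centroid by Euclidean distance.

#Sketch only: assign each daily profile to the closest training centroid (Euclidean distance assumed)
assign.daily.cluster.sketch <- function(day.profiles, centroids){
    apply(day.profiles, 1, function(profile){
        dists <- apply(centroids, 1, function(centroid) sqrt(sum((profile - centroid)^2)))
        which.min(dists) #id of the nearest centroid
    })
}
#e.g. assign.daily.cluster.sketch(C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)], C0875.vector.filter.train.centroids)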

Base Models Creation

Model 1 (J=2)

jointsize <- 2 

Create Argmax Model

#Argmax model creation
C0875.vector.filter.train <- station.add.missing.profiles(C0875.vector.filter.train)
C0875.filter.train.joint <- station.joint.probability(C0875.vector.filter.train$clusterid,jointsize)
C0875.argmax.model <- model.argmax(C0875.filter.train.joint)
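
station.add.missing.profiles(), station.joint.probability() and model.argmax() are all solarf functions (the first pulls in zoo to fill gaps in the daily series so that consecutive-day histories are well defined). As a rough sketch of the idea behind the model, and only an assumption about the package internals: count how often each sequence of J consecutive cluster ids occurs in training, then, for a history of J-1 ids, predict the id that ends the most frequent matching sequence.

#Sketch only: frequency table of J consecutive cluster ids and an argmax lookup
joint.counts.sketch <- function(clusterid, J){
    n <- length(clusterid)
    grams <- sapply(1:(n - J + 1), function(i) paste(clusterid[i:(i + J - 1)], collapse=" "))
    table(grams) #count of each observed sequence of J cluster ids
}

argmax.predict.sketch <- function(counts, history){
    prefix <- paste0(paste(history, collapse=" "), " ")
    hits <- counts[substr(names(counts), 1, nchar(prefix)) == prefix] #sequences starting with the history
    if (length(hits) == 0 || sum(hits) == 0) return (NA) #unseen history: no prediction
    best <- names(hits)[which.max(hits)]
    as.numeric(tail(strsplit(best, " ")[[1]], 1)) #last id of the most frequent matching sequence
}

For J=2 this sketch reduces to a first-order Markov chain on the daily cluster ids.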

Perform Prediction

#Prediction using the model
C0875.vector.filter.pred <-station.predict.daily.cluster(station.vector.clustered=Results.groundtruth,model=C0875.argmax.model)

All probabilities for each Prediction

#All probabilities using the model
C0875.filter.train.conditional <- station.conditional.probability(C0875.filter.train.joint)
C0875.vector.filter.All.Probs <- station.clusterid.probability(station.conditional.probability=C0875.filter.train.conditional,clusterid.vector = Results.groundtruth$clusterid)
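
station.conditional.probability() and station.clusterid.probability() are likewise solarf functions; the assumed idea is to normalise the joint counts into P(next cluster | history) and to report, for each test day, the full conditional distribution instead of only its argmax. A sketch reusing joint.counts.sketch() from above:

#Sketch only: conditional distribution of the next cluster id given a J-1 day history
conditional.probs.sketch <- function(counts, history){
    prefix <- paste0(paste(history, collapse=" "), " ")
    hits <- counts[substr(names(counts), 1, nchar(prefix)) == prefix]
    if (sum(hits) == 0) return (NULL) #unseen history
    hits / sum(hits) #normalise joint counts into conditional probabilities
}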

M1 Outcome Dataframe (Prediction|Support|AllProbs)

Results.m1<-data.frame(
  Pred = C0875.vector.filter.pred$Daym0,
  Support = C0875.vector.filter.pred$Freq,
  All.Probs = C0875.vector.filter.All.Probs,
  Hist = C0875.vector.filter.pred$Hist
)
kable(head(Results.m1))
| Pred | Support | All.Probs | Hist |
|------|---------|-----------|------|
| NA | NA | NA | NA |
| 2 | 56 | 0.125 0.5 0.160714285714286 0.125 0.0892857142857143 | 2 |
| 2 | 29 | 0.206896551724138 0.551724137931034 0.137931034482759 0.0344827586206897 0.0689655172413793 | 1 |
| 3 | 48 | 0.0416666666666667 0.291666666666667 0.333333333333333 0.166666666666667 0.166666666666667 | 3 |
| 3 | 48 | 0.0416666666666667 0.291666666666667 0.333333333333333 0.166666666666667 0.166666666666667 | 3 |
| 3 | 48 | 0.0416666666666667 0.291666666666667 0.333333333333333 0.166666666666667 0.166666666666667 | 3 |

Model 2 (J=3)

jointsize <- 3 

Create Argmax Model

#Argmax model creation
C0875.filter.train.joint <- station.joint.probability(C0875.vector.filter.train$clusterid,jointsize)
C0875.argmax.model <- model.argmax(C0875.filter.train.joint)

Perform Prediction

#Prediction using the model
C0875.vector.filter.pred <-station.predict.daily.cluster(station.vector.clustered=Results.groundtruth,model=C0875.argmax.model)

All probabilities for each Prediction

#All probabilities using the model
C0875.filter.train.conditional <- station.conditional.probability(C0875.filter.train.joint)
C0875.vector.filter.All.Probs <- station.clusterid.probability(station.conditional.probability=C0875.filter.train.conditional,clusterid.vector = Results.groundtruth$clusterid)

M2 Outcome Dataframe (Prediction|Support|AllProbs)

Results.m2<-data.frame(
  Pred = C0875.vector.filter.pred$Daym0,
  Support = C0875.vector.filter.pred$Freq,
  All.Probs = C0875.vector.filter.All.Probs,
  Hist = C0875.vector.filter.pred$Hist
)
kable(head(Results.m2))
| Pred | Support | All.Probs | Hist |
|------|---------|-----------|------|
| NA | NA | NA | NA |
| NA | NA | NA | NA |
| 2 | 7 | 0.142857142857143 0.571428571428571 0.142857142857143 0.142857142857143 | 2 1 |
| 2 | 4 | 0.75 0.25 | 1 3 |
| 3 | 14 | 0.357142857142857 0.5 0.0714285714285714 0.0714285714285714 | 3 3 |
| 3 | 14 | 0.357142857142857 0.5 0.0714285714285714 0.0714285714285714 | 3 3 |

Model 3 (J=4)

kable(head(Results.m3))
| Pred | Support | All.Probs | Hist |
|------|---------|-----------|------|
| NA | NA | NA | NA |
| NA | NA | NA | NA |
| NA | NA | NA | NA |
| 2 | 1 | 1 | 2 1 3 |
| NA | 0 | NA | 1 3 3 |
| 3 | 7 | 0.428571428571429 0.571428571428571 | 3 3 3 |

Model 4 (J=5)

kable(head(Results.m4))
| Pred | Support | All.Probs | Hist |
|------|---------|-----------|------|
| NA | NA | NA | NA |
| NA | NA | NA | NA |
| NA | NA | NA | NA |
| NA | NA | NA | NA |
| NA | 0 | NA | 2 1 3 3 |
| NA | 0 | NA | 1 3 3 3 |

Model 5 (J=6)

kable(head(Results.m5))
| Pred | Support | All.Probs | Hist |
|------|---------|-----------|------|
| NA | NA | NA | NA |
| NA | NA | NA | NA |
| NA | NA | NA | NA |
| NA | NA | NA | NA |
| NA | NA | NA | NA |
| NA | 0 | NA | 2 1 3 3 3 |

Model 6 (J=7)

kable(head(Results.m6))
| Pred | Support | All.Probs | Hist |
|------|---------|-----------|------|
| NA | NA | NA | NA |
| NA | NA | NA | NA |
| NA | NA | NA | NA |
| NA | NA | NA | NA |
| NA | NA | NA | NA |
| NA | NA | NA | NA |

Model 7 (J=8)

kable(head(Results.m7))
| Pred | Support | All.Probs | Hist |
|------|---------|-----------|------|
| NA | NA | NA | NA |
| NA | NA | NA | NA |
| NA | NA | NA | NA |
| NA | NA | NA | NA |
| NA | NA | NA | NA |
| NA | NA | NA | NA |

Entropy Arbitrer

Minimum-entropy selection criterion: for each day, the arbitrer picks the base model whose predictive distribution has the lowest entropy. The first call below also passes each model's support counts; the second does not.

Results.decision <- min.entropy.arbitrer(data.frame(
  Results.m1$All.Probs,
  Results.m2$All.Probs,
  Results.m3$All.Probs,  
  Results.m4$All.Probs,  
  Results.m5$All.Probs,
  Results.m6$All.Probs,
  Results.m7$All.Probs  
  ),
  data.frame(
  Results.m1$Support,
  Results.m2$Support,
  Results.m3$Support,  
  Results.m4$Support,  
  Results.m5$Support,
  Results.m6$Support,
  Results.m7$Support
  )
)
Results.decision.no.support <- min.entropy.arbitrer(data.frame(
  Results.m1$All.Probs,
  Results.m2$All.Probs,
  Results.m3$All.Probs,  
  Results.m4$All.Probs,  
  Results.m5$All.Probs,
  Results.m6$All.Probs,
  Results.m7$All.Probs  
  ))
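
min.entropy.arbitrer() is part of solarf. Consistent with the criterion described above, a minimal sketch of a minimum-entropy arbitrer (an assumption about the implementation, not the package source): compute the Shannon entropy of each model's predictive distribution for the day, optionally discard models whose support is zero, and return the index of the model with the lowest entropy.

#Sketch only: per-day selection of the base model with the lowest-entropy distribution
shannon.entropy <- function(p){
    p <- p[!is.na(p) & p > 0]
    if (length(p) == 0) return (Inf) #no probabilities available for this model
    -sum(p * log2(p))
}

min.entropy.index.sketch <- function(prob.list, support=NULL){
    entropies <- sapply(prob.list, shannon.entropy)
    if (!is.null(support)) entropies[is.na(support) | support == 0] <- Inf #ignore models without history support
    which.min(entropies) #index of the selected base model
}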

Error Calculation

Ensemble Prediction

Results <- data.frame(
  groundtruth = Results.groundtruth,
  m1 = Results.m1,
  m2 = Results.m2,
  m3 = Results.m3,  
  m4 = Results.m4,  
  m5 = Results.m5,
  m6 = Results.m6,  
  m7 = Results.m7,    
  decision = Results.decision, #Returns Inf when no entropy is available because no model has history support
  decision.nosupport = Results.decision.no.support 
)

Output the final predictions using the arbitrer WITH support

Ensemble.Prediction <- data.frame(
  m1.pred = Results.m1$Pred,
  m2.pred = Results.m2$Pred,
  m3.pred = Results.m3$Pred,  
  m4.pred = Results.m4$Pred,  
  m5.pred = Results.m5$Pred,
  m6.pred = Results.m6$Pred,
  m7.pred = Results.m7$Pred,  
  decision = Results.decision$Index
)
#Matrix-style indexing: for each row, pick the prediction from the model column chosen by the arbitrer
Results$arbitrerPred <- Ensemble.Prediction[cbind(1:nrow(Ensemble.Prediction), Ensemble.Prediction$decision)]

Output the final predictions using the arbitrer WITHOUT support

Ensemble.Prediction.no.support <- data.frame(
  m1.pred = Results.m1$Pred,
  m2.pred = Results.m2$Pred,
  m3.pred = Results.m3$Pred,  
  m4.pred = Results.m4$Pred,  
  m5.pred = Results.m5$Pred,
  m6.pred = Results.m6$Pred,
  m7.pred = Results.m7$Pred,  
  decision = Results.decision.no.support$Index
)
Results$arbitrerPredNoSupport <- Ensemble.Prediction.no.support[cbind(1:nrow(Ensemble.Prediction.no.support), Ensemble.Prediction.no.support$decision)]

NA to Most Frequent Centroid

#Most frequent cluster in the training data, used as a fallback for days without a prediction
a <- summary(as.factor(C0875.vector.filter.train$clusterid[!is.na(C0875.vector.filter.train$clusterid)]))
base.centroid <- which(a == max(a))
Ensemble.Prediction$m1.pred[is.na(Ensemble.Prediction$m1.pred)] <- base.centroid
Ensemble.Prediction$m2.pred[is.na(Ensemble.Prediction$m2.pred)] <- base.centroid
Ensemble.Prediction$m3.pred[is.na(Ensemble.Prediction$m3.pred)] <- base.centroid
Ensemble.Prediction$m4.pred[is.na(Ensemble.Prediction$m4.pred)] <- base.centroid
Ensemble.Prediction$m5.pred[is.na(Ensemble.Prediction$m5.pred)] <- base.centroid
Ensemble.Prediction$m6.pred[is.na(Ensemble.Prediction$m6.pred)] <- base.centroid
## Warning in `[<-.factor`(`*tmp*`, is.na(Ensemble.Prediction$m6.pred), value
## = structure(c(NA, : invalid factor level, NA generated
Ensemble.Prediction$m7.pred[is.na(Ensemble.Prediction$m7.pred)] <- base.centroid
## Warning in `[<-.factor`(`*tmp*`, is.na(Ensemble.Prediction$m7.pred), value
## = structure(c(NA, : invalid factor level, NA generated
Results$arbitrerPred[is.na(Results$arbitrerPred)] <- base.centroid
Results$arbitrerPredNoSupport[is.na(Results$arbitrerPredNoSupport)] <- base.centroid

Individual Errors

Base Model 1 Error

m1.error <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Ensemble.Prediction$m1.pred,cluster.centroids=C0875.vector.filter.train.centroids)

hist(m1.error)
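
error.absolute.centroid() is also a solarf function; presumably it replaces each predicted cluster id with the corresponding centroid profile and takes the absolute deviation from the observed hourly values. A sketch under that assumption (hypothetical helper, numeric cluster ids):

#Sketch only: absolute error between the predicted centroid profile and the observed hourly profile
error.absolute.centroid.sketch <- function(observed, pred.clusters, centroids){
    predicted <- centroids[pred.clusters, , drop=FALSE] #centroid row for each predicted cluster id
    abs(as.matrix(observed) - predicted) #per-hour absolute error, one row per test day
}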