Libraries and Data Pre-Processing

#Load Required Libraries
library(solarf)
library(knitr) #For pretty tables on the report using kable 
library(plyr)
library(ggplot2)
library(GGally)
library(caret)
library(extrafont)

#Theme for Plot
science_theme = theme(
    panel.background=element_blank(), 
    #panel.grid.major = element_line(size = 0.5, color = "grey"),
    #axis.line = element_line(size = 0.7, color = "black"), 
    #legend.position = c(0.9, 0.7), 
    #text = element_text(size = 14),
    axis.line=element_blank(),
    text = element_text(size=15, family="Interstate"),
    axis.text.x = element_text(hjust = 0, size=15,color="black"), 
    axis.text.y = element_text(hjust = 0, size=15,color="black"), 
    axis.title.y=element_text(size=15), 
    axis.title.x=element_text(size=15),
    axis.ticks = element_line(colour=NA),
    panel.grid.major = element_line(colour = 'gray', linetype = 'dashed'),
    panel.grid.minor = element_line(colour = NA),  
    panel.background = element_rect(colour = 'white'))

Load Station C0875 (Southeast Oahu Island)

C0875 <- read.csv("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/C0875_2011_2014.csv")

station.vectorize.hour <- function(station.dataframe,variable){
    library(reshape2)
    use.only.last.ocurrence <- function(minute.obs){
        return (minute.obs[1])
    }
    
    station.dataframe.vectorized <- dcast(data=station.dataframe, formula=YEAR + MON + DAY + HR ~ MIN,value.var = variable,fun.aggregate=use.only.last.ocurrence)
    return (station.dataframe.vectorized)
}

Vectorize Step

Vectorize C0875 over the training and test partitions.

#Moves from minute level to hourly level
C0875.vector.h <- station.vectorize.hour(C0875,"SOLR")
C0875.h <- C0875.vector.h[,c("YEAR","MON","DAY","HR")]
C0875.h$MIN <- 0
C0875.h$HR <- C0875.h$HR + 1
C0875.h$SOLR <- rowMeans(C0875.vector.h[,5:ncol(C0875.vector.h)],na.rm=TRUE) #Calculate the mean for the hour from min 0 to min 60 ; baseline is hence hour and 30 mins. 

C0875.h <- readRDS("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/SCBH1.rds")
#C0875.h <- C0875.h[C0875.h$YEAR > 2011,]

C0875.vector <- station.vectorize(C0875.h,"SOLR")
C0875.vector <- C0875.vector[,c(1:3,12:21)] #SOLR starts on Feb 15, 2012. Change to 12:21 for schb1

Data Point Filtering

#C0875.vector.filter <- station.vectorize(C0875[!is.na(C0875$SOLR),],"SOLR")
C0875.vector.filter <- C0875.vector[complete.cases(C0875.vector),]

Train and Test Split

#Separate the year of 2014 for prediction testing 
C0875.vector.filter.test <- C0875.vector.filter[C0875.vector.filter$YEAR==2014,]
C0875.vector.filter.train <- C0875.vector.filter[C0875.vector.filter$YEAR!=2014,]

#C0875.vector.filter.train <- C0875.vector.filter[C0875.vector.filter$MON==11 |
#                                                     C0875.vector.filter$MON==12 |
#                                                     C0875.vector.filter$MON==1 |
#                                                     C0875.vector.filter$MON==2 |
#                                                     C0875.vector.filter$MON==3 |
#                                                     C0875.vector.filter$MON==4
#                                                     ,]

Cluster Step

Cluster Train Dataset

Cluster with K-means = nclusters the training dataset and assign clusters to the daily profiles.

nclusters <- 5
set.seed(1234)
C0875.filter.train.clusters <- kmeans(C0875.vector.filter.train[,4:ncol(C0875.vector.filter)],nclusters)
C0875.vector.filter.train$clusterid <- C0875.filter.train.clusters[[1]]
C0875.vector.filter.train.centroids <- C0875.filter.train.clusters[[2]]

intensity.crescent.ordered.clusterid <- as.numeric(names(sort(rowSums(C0875.vector.filter.train.centroids))))
#Order the centroids in crescent order of ipntensity and reset rows
C0875.vector.filter.train.centroids <- C0875.vector.filter.train.centroids[intensity.crescent.ordered.clusterid,]
rownames(C0875.vector.filter.train.centroids) <- seq(1,nclusters)
#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
C0875.vector.filter.train$clusterid <- intensity.crescent.ordered.clusterid[C0875.vector.filter.train$clusterid]

Centroids

centroids_plot <- data.frame(C0875.vector.filter.train.centroids)
centroids_plot$color <-rownames(C0875.vector.filter.train.centroids)
colnames(centroids_plot) <- c(colnames(C0875.vector.filter.train.centroids),"Profile")
kable(C0875.vector.filter.train.centroids)
8 9 10 11 12 13 14 15 16 17
96.45584 145.7208 184.0028 208.9416 225.6738 235.2094 239.6068 238.3433 203.0014 111.5228
275.89212 431.7226 542.3784 549.3476 472.7911 394.2021 338.9863 254.0959 164.6301 67.0274
171.47927 302.4083 428.9764 542.8919 646.5695 679.3775 633.5432 530.5714 347.3910 149.9687
362.70761 583.1298 739.1073 815.1869 784.6263 689.3010 566.5087 413.2837 266.8270 109.6263
355.95481 572.0663 756.5471 896.4277 969.5854 957.8625 856.8865 690.5565 459.9856 198.4452

Centroids

ggparcoord(
  data = centroids_plot,
  columns=c(1:ncol(C0875.vector.filter.train.centroids)),
  scale="globalminmax",
  alphaLines=0.2,
  groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
  ylab("Solar Irradiance (W/m^2)") + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))

#+ scale_fill_discrete(name="Discretized\ Profile", breaks=c("1", "2", "3","4","5"), labels=c("1 (Overcast)", "2","3","4","5"))# + scale_colour_brewer(palette="Spectral")

Cluster Test Dataset (Ground Truth)

Results.groundtruth <- station.assign.daily.cluster(C0875.vector.filter.test,C0875.vector.filter.train.centroids)

Base Models Creation

Model 1 (J=2)

jointsize <- 2 

Create Argmax Model

#Argmax model creation
C0875.vector.filter.train <- station.add.missing.profiles(C0875.vector.filter.train)
## Warning: package 'zoo' was built under R version 3.1.3
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
C0875.filter.train.joint <- station.joint.probability(C0875.vector.filter.train$clusterid,jointsize)
C0875.argmax.model <- model.argmax(C0875.filter.train.joint)
## hash-2.2.6 provided by Decision Patterns

Perform Prediction

#Prediction using the model
C0875.vector.filter.pred <-station.predict.daily.cluster(station.vector.clustered=Results.groundtruth,model=C0875.argmax.model)

All probabilities for each Prediction

#All probabilities using the model
C0875.filter.train.conditional <- station.conditional.probability(C0875.filter.train.joint)
C0875.vector.filter.All.Probs <- station.clusterid.probability(station.conditional.probability=C0875.filter.train.conditional,clusterid.vector = Results.groundtruth$clusterid)

M1 Outcome Dataframe (Prediction|Support|AllProbs)

Results.m1<-data.frame(
  Pred = C0875.vector.filter.pred$Daym0,
  Support = C0875.vector.filter.pred$Freq,
  All.Probs = C0875.vector.filter.All.Probs,
  Hist = C0875.vector.filter.pred$Hist
)
kable(head(Results.m1))
Pred Support All.Probs Hist
NA NA NA NA
1 482 0.462655601659751 0.0829875518672199 0.0186721991701245 0.282157676348548 0.153526970954357 1
1 482 0.462655601659751 0.0829875518672199 0.0186721991701245 0.282157676348548 0.153526970954357 1
1 482 0.462655601659751 0.0829875518672199 0.0186721991701245 0.282157676348548 0.153526970954357 1
1 482 0.462655601659751 0.0829875518672199 0.0186721991701245 0.282157676348548 0.153526970954357 1
1 482 0.462655601659751 0.0829875518672199 0.0186721991701245 0.282157676348548 0.153526970954357 1

Model 2 (J=3)

jointsize <- 3 

Create Argmax Model

#Argmax model creation
C0875.filter.train.joint <- station.joint.probability(C0875.vector.filter.train$clusterid,jointsize)
C0875.argmax.model <- model.argmax(C0875.filter.train.joint)

Perform Prediction

#Prediction using the model
C0875.vector.filter.pred <-station.predict.daily.cluster(station.vector.clustered=Results.groundtruth,model=C0875.argmax.model)

All probabilities for each Prediction

#All probabilities using the model
C0875.filter.train.conditional <- station.conditional.probability(C0875.filter.train.joint)
C0875.vector.filter.All.Probs <- station.clusterid.probability(station.conditional.probability=C0875.filter.train.conditional,clusterid.vector = Results.groundtruth$clusterid)

M2 Outcome Dataframe (Prediction|Support|AllProbs)

Results.m2<-data.frame(
  Pred = C0875.vector.filter.pred$Daym0,
  Support = C0875.vector.filter.pred$Freq,
  All.Probs = C0875.vector.filter.All.Probs,
  Hist = C0875.vector.filter.pred$Hist
)
kable(head(Results.m2))
Pred Support All.Probs Hist
NA NA NA NA
NA NA NA NA
1 207 0.507246376811594 0.072463768115942 0.00966183574879227 0.280193236714976 0.130434782608696 1 1
1 207 0.507246376811594 0.072463768115942 0.00966183574879227 0.280193236714976 0.130434782608696 1 1
1 207 0.507246376811594 0.072463768115942 0.00966183574879227 0.280193236714976 0.130434782608696 1 1
1 207 0.507246376811594 0.072463768115942 0.00966183574879227 0.280193236714976 0.130434782608696 1 1

Model 3 (J=4)

kable(head(Results.m3))
Pred Support All.Probs Hist
NA NA NA NA
NA NA NA NA
NA NA NA NA
1 101 0.554455445544554 0.0693069306930693 0.0099009900990099 0.267326732673267 0.099009900990099 1 1 1
1 101 0.554455445544554 0.0693069306930693 0.0099009900990099 0.267326732673267 0.099009900990099 1 1 1
1 101 0.554455445544554 0.0693069306930693 0.0099009900990099 0.267326732673267 0.099009900990099 1 1 1

Model 4 (J=5)

kable(head(Results.m4))
Pred Support All.Probs Hist
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
1 53 0.566037735849057 0.0943396226415094 0.283018867924528 0.0566037735849057 1 1 1 1
1 53 0.566037735849057 0.0943396226415094 0.283018867924528 0.0566037735849057 1 1 1 1

Model 5 (J=6)

kable(head(Results.m5))
Pred Support All.Probs Hist
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
1 29 0.655172413793103 0.137931034482759 0.172413793103448 0.0344827586206897 1 1 1 1 1

Model 6 (J=7)

kable(head(Results.m6))
Pred Support All.Probs Hist
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA

Model 7 (J=8)

kable(head(Results.m7))
Pred Support All.Probs Hist
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA

Entropy Arbtitrer

Minimum entropy selection criteria

Results.decision <- min.entropy.arbitrer(data.frame(
  Results.m1$All.Probs,
  Results.m2$All.Probs,
  Results.m3$All.Probs,  
  Results.m4$All.Probs,  
  Results.m5$All.Probs,
  Results.m6$All.Probs,
  Results.m7$All.Probs  
  ),
  data.frame(
  Results.m1$Support,
  Results.m2$Support,
  Results.m3$Support,  
  Results.m4$Support,  
  Results.m5$Support,
  Results.m6$Support,
  Results.m7$Support
  )
)
Results.decision.no.support <- min.entropy.arbitrer(data.frame(
  Results.m1$All.Probs,
  Results.m2$All.Probs,
  Results.m3$All.Probs,  
  Results.m4$All.Probs,  
  Results.m5$All.Probs,
  Results.m6$All.Probs,
  Results.m7$All.Probs  
  ))

Error Calculation

Ensemble Prediction

Results <- data.frame(
  groundtruth = Results.groundtruth,
  m1 = Results.m1,
  m2 = Results.m2,
  m3 = Results.m3,  
  m4 = Results.m4,  
  m5 = Results.m5,
  m6 = Results.m6,  
  m7 = Results.m7,    
  decision = Results.decision, #Returns Inf if not a single entropy is available due to lack of hist. 
  decision.nosupport = Results.decision.no.support 
)

Output the final predictions using the arbitrer WITH support

Ensemble.Prediction <- data.frame(
  m1.pred = Results.m1$Pred,
  m2.pred = Results.m2$Pred,
  m3.pred = Results.m3$Pred,  
  m4.pred = Results.m4$Pred,  
  m5.pred = Results.m5$Pred,
  m6.pred = Results.m6$Pred,
  m7.pred = Results.m7$Pred,  
  decision = Results.decision$Index
)
Results$arbitrerPred <- Ensemble.Prediction[cbind(1:nrow(Ensemble.Prediction), Ensemble.Prediction$decision)]

Output the final predictions using the arbitrer WITHOUT support

Ensemble.Prediction.no.support <- data.frame(
  m1.pred = Results.m1$Pred,
  m2.pred = Results.m2$Pred,
  m3.pred = Results.m3$Pred,  
  m4.pred = Results.m4$Pred,  
  m5.pred = Results.m5$Pred,
  m6.pred = Results.m6$Pred,
  m7.pred = Results.m7$Pred,  
  decision = Results.decision.no.support$Index
)
Results$arbitrerPredNoSupport <- Ensemble.Prediction.no.support[cbind(1:nrow(Ensemble.Prediction.no.support), Ensemble.Prediction.no.support$decision)]

NA to Most Frequent Centroid

a <- summary(as.factor(C0875.vector.filter.train$clusterid[!is.na(C0875.vector.filter.train$clusterid)]))
base.centroid <- which(a == max(a))
Ensemble.Prediction$m1.pred[is.na(Ensemble.Prediction$m1.pred)] <- base.centroid
Ensemble.Prediction$m2.pred[is.na(Ensemble.Prediction$m2.pred)] <- base.centroid
Ensemble.Prediction$m3.pred[is.na(Ensemble.Prediction$m3.pred)] <- base.centroid
Ensemble.Prediction$m4.pred[is.na(Ensemble.Prediction$m4.pred)] <- base.centroid
Ensemble.Prediction$m5.pred[is.na(Ensemble.Prediction$m5.pred)] <- base.centroid
Ensemble.Prediction$m6.pred[is.na(Ensemble.Prediction$m6.pred)] <- base.centroid
Ensemble.Prediction$m7.pred[is.na(Ensemble.Prediction$m7.pred)] <- base.centroid
Results$arbitrerPred[is.na(Results$arbitrerPred)] <- base.centroid
Results$arbitrerPredNoSupport[is.na(Results$arbitrerPredNoSupport)] <- base.centroid

Individual Errors

Base Model 1 Error

m1.error <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Ensemble.Prediction$m1.pred,cluster.centroids=C0875.vector.filter.train.centroids)

hist(m1.error)

mean(m1.error,na.rm=TRUE)
## [1] 1615.588
sum(m1.error,na.rm=TRUE)/(10*length(m1.error[!is.na(m1.error)]))
## [1] 161.5588

Base Model 2 Error

m2.error <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Ensemble.Prediction$m2.pred,cluster.centroids=C0875.vector.filter.train.centroids)

hist(m2.error)

mean(m2.error,na.rm=TRUE)
## [1] 1749.771
sum(m2.error,na.rm=TRUE)/(10*length(m2.error[!is.na(m2.error)]))
## [1] 174.9771

Base Model 3 Error

m3.error <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Ensemble.Prediction$m3.pred,cluster.centroids=C0875.vector.filter.train.centroids)

hist(m3.error)

mean(m3.error,na.rm=TRUE)
## [1] 1759.679
sum(m3.error,na.rm=TRUE)/(10*length(m3.error[!is.na(m3.error)]))
## [1] 175.9679

Base Model 4 Error

m4.error <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Ensemble.Prediction$m4.pred,cluster.centroids=C0875.vector.filter.train.centroids)

hist(m4.error)

mean(m4.error,na.rm=TRUE)
## [1] 1812.23
sum(m4.error,na.rm=TRUE)/(10*length(m4.error[!is.na(m4.error)]))
## [1] 181.223

Base Model 5 Error

m5.error <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Ensemble.Prediction$m5.pred,cluster.centroids=C0875.vector.filter.train.centroids)

hist(m5.error)

mean(m5.error,na.rm=TRUE)
## [1] 1788.589
sum(m5.error,na.rm=TRUE)/(10*length(m5.error[!is.na(m5.error)]))
## [1] 178.8589

Base Model 6 Error

m6.error <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Ensemble.Prediction$m6.pred,cluster.centroids=C0875.vector.filter.train.centroids)

hist(m6.error)

mean(m6.error,na.rm=TRUE)
## [1] 1754.356
sum(m6.error,na.rm=TRUE)/(10*length(m6.error[!is.na(m6.error)]))
## [1] 175.4356

Base Model 7 Error

m7.error <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Ensemble.Prediction$m7.pred,cluster.centroids=C0875.vector.filter.train.centroids)

hist(m7.error)

mean(m7.error,na.rm=TRUE)
## [1] 1728.86
sum(m7.error,na.rm=TRUE)/(10*length(m7.error[!is.na(m7.error)]))
## [1] 172.886

Error From Ensemble Prediction WITHOUT support

ensemble.error.no.support <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Results$arbitrerPredNoSupport,cluster.centroids=C0875.vector.filter.train.centroids)

hist(ensemble.error.no.support)

mean(ensemble.error.no.support,na.rm=TRUE)
## [1] 1887.043
sum(ensemble.error.no.support,na.rm=TRUE)/(10*length(ensemble.error.no.support[!is.na(ensemble.error.no.support)]))
## [1] 188.7043

Error From Ensemble Prediction WITH support

ensemble.error <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Results$arbitrerPred,cluster.centroids=C0875.vector.filter.train.centroids)

hist(ensemble.error)

mean(ensemble.error,na.rm=TRUE)
## [1] 1918.836
sum(ensemble.error,na.rm=TRUE)/(10*length(ensemble.error[!is.na(ensemble.error)]))
## [1] 191.8836

Arbiter Base Model Selection Count

Support & Entropy

How much from each model is the ensemble using WITH the support weight?

qplot(decision, data=Ensemble.Prediction, geom="histogram",ylab="Frequency",xlab="w") + scale_x_discrete(limits=c("2","3","4","5","6","7","8")) + science_theme

Entropy

qplot(decision, data=Ensemble.Prediction.no.support, geom="histogram",ylab="Frequency",xlab="w") + scale_x_discrete(limits=c("2","3","4","5","6","7","8")) + science_theme + theme(axis.text.x = element_text(hjust = 0.5, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))  + theme( # remove the vertical grid lines
           panel.grid.major.x = element_blank() ,
           # explicitly set the horizontal lines (or they will disappear too)
           panel.grid.major.y = element_line( size=.1, color="black" )
    ) 

It is noticiable that model B5 is not chosen as much anymore, and B1 now is chosen less than B2. B3 and B4 are now chosen much more for the prediction. This effect gives hopes that now the ensemble will do better, given it is choosen the model with the least amount of error more (B3).

Daily Mean Absolute Error per Model

mean_error <- data.frame(error=c(
as.integer(mean(m1.error,na.rm=TRUE)),
as.integer(mean(m2.error,na.rm=TRUE)),
as.integer(mean(m3.error,na.rm=TRUE)),
as.integer(mean(m4.error,na.rm=TRUE)),
as.integer(mean(m5.error,na.rm=TRUE)),
as.integer(mean(m6.error,na.rm=TRUE)),
as.integer(mean(m7.error,na.rm=TRUE)),
as.integer(mean(ensemble.error.no.support,na.rm=TRUE)),
as.integer(mean(ensemble.error,na.rm=TRUE))
))
labels <- c("B1","B2","B3","B4","B5","B6","B7","E","ESpt")

qplot(labels,mean_error$error, geom="bar",stat="identity",xlab="Model",ylab="Daily Mean Absolute Error") + science_theme

rownames(mean_error) <- labels
kable(mean_error)
error
B1 1615
B2 1749
B3 1759
B4 1812
B5 1788
B6 1754
B7 1728
E 1887
ESpt 1918

Daily Hourly Absolute Error per Model (HECO)

mean_error <- data.frame(error=c(
as.integer(sum(m1.error,na.rm=TRUE)/(10*length(m1.error[!is.na(m1.error)]))),
as.integer(sum(m2.error,na.rm=TRUE)/(10*length(m2.error[!is.na(m2.error)]))),
as.integer(sum(m3.error,na.rm=TRUE)/(10*length(m3.error[!is.na(m3.error)]))),
as.integer(sum(m4.error,na.rm=TRUE)/(10*length(m4.error[!is.na(m4.error)]))),
as.integer(sum(m5.error,na.rm=TRUE)/(10*length(m5.error[!is.na(m5.error)]))),
as.integer(sum(m6.error,na.rm=TRUE)/(10*length(m6.error[!is.na(m6.error)]))),
as.integer(sum(m7.error,na.rm=TRUE)/(10*length(m7.error[!is.na(m7.error)]))),
as.integer(sum(ensemble.error.no.support,na.rm=TRUE)/(10*length(ensemble.error.no.support[!is.na(ensemble.error.no.support)]))),
as.integer(sum(ensemble.error,na.rm=TRUE)/(10*length(ensemble.error[!is.na(ensemble.error)])))
))
labels <- c("2","3","4","5","6","7","8","Ent","Spt")
qplot(labels,mean_error$error, geom="bar",stat="identity",xlab="w",ylab="Mean Absolute Error (W/m^2)") + science_theme + theme(axis.text.x = element_text(hjust = 0.5, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20)) + ylim(0, 400) + theme( # remove the vertical grid lines
           panel.grid.major.x = element_blank() ,
           # explicitly set the horizontal lines (or they will disappear too)
           panel.grid.major.y = element_line( size=.1, color="black" )
    ) 

rownames(mean_error) <- labels
kable(mean_error)
error
2 161
3 174
4 175
5 181
6 178
7 175
8 172
Ent 188
Spt 191

We can see that the use of the support value improves compared to just using the minimum entropy value in order to decise which base model to predict.

Daily Missing Prediction per Model

missing <- data.frame(missing=c(
round(length(Ensemble.Prediction$m1.pred[is.na(Ensemble.Prediction$m1.pred)])/345,3)*100,
round(length(Ensemble.Prediction$m2.pred[is.na(Ensemble.Prediction$m2.pred)])/345,3)*100,
round(length(Ensemble.Prediction$m3.pred[is.na(Ensemble.Prediction$m3.pred)])/345,3)*100,
round(length(Ensemble.Prediction$m4.pred[is.na(Ensemble.Prediction$m4.pred)])/345,3)*100,
round(length(Ensemble.Prediction$m5.pred[is.na(Ensemble.Prediction$m5.pred)])/345,3)*100,
round(length(Ensemble.Prediction$m6.pred[is.na(Ensemble.Prediction$m6.pred)])/345,3)*100,
round(length(Ensemble.Prediction$m7.pred[is.na(Ensemble.Prediction$m7.pred)])/345,3)*100,
round(length(Results$arbitrerPred[is.na(Results$arbitrerPred)])/345,3)*100,
round(length(Results$arbitrerPredNoSupport[is.na(Results$arbitrerPredNoSupport)])/345,3)*100
))
labels <- c("m1","m2","m3","m4","m5","m6","m7","Pm","PmSpt")

qplot(labels,missing$missing, geom="bar",stat="identity",xlab="Model",ylab=" % Daily Missing Predictions")  + science_theme  + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))