library(solarf, quietly = TRUE)   # station helpers: vectorize, cluster assignment, error metrics
library(knitr, quietly = TRUE)    # kable tables
library(ggplot2, quietly = TRUE)  # plotting
library(klaR, quietly = TRUE)     # NaiveBayes classifier
library(GGally, quietly = TRUE)   # ggparcoord parallel-coordinate plots
library(plyr, quietly = TRUE)
library(zoo, quietly = TRUE)      # rollapply for sliding windows
# Theme shared by all plots
science_theme <- theme(panel.background = element_blank(),
                       panel.grid.major = element_line(size = 0.5, color = "grey"),
                       axis.line = element_line(size = 0.7, color = "black"),
                       legend.position = c(0.9, 0.7),
                       text = element_text(size = 14))
#Load data
#C0875 <- readRDS("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/C0875.rds")
C0875 <- read.csv("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/C0875_2011_2014.csv")
station.vectorize.hour <- function(station.dataframe, variable) {
  library(reshape2)
  # If an hour/minute combination appears more than once, keep only the first observation.
  use.only.first.occurrence <- function(minute.obs) {
    minute.obs[1]
  }
  # Cast the long per-minute records into one row per hour, with one column per minute.
  station.dataframe.vectorized <- dcast(data = station.dataframe,
                                        formula = YEAR + MON + DAY + HR ~ MIN,
                                        value.var = variable,
                                        fun.aggregate = use.only.first.occurrence)
  return(station.dataframe.vectorized)
}
C0875.vector.h <- station.vectorize.hour(C0875, "SOLR")
C0875.h <- C0875.vector.h[, c("YEAR", "MON", "DAY", "HR")]
# Average the minute columns to obtain an hourly mean; the hourly value therefore
# represents the interval centred roughly at half past the hour.
C0875.h$SOLR <- rowMeans(C0875.vector.h[, 5:ncol(C0875.vector.h)], na.rm = TRUE)
C0875.h$HR <- C0875.h$HR + 1  # shift the hour index by one (0-23 becomes 1-24)
C0875.vector <- station.vectorize(C0875.h,"SOLR")
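station.vectorize is provided by the solarf package and is not defined in this document. For reference only, the sketch below shows what it is assumed to do, by analogy with station.vectorize.hour above: cast the hourly records into one row per day with one column per hour. The body is a hypothetical reimplementation, not the package source.
# Hypothetical sketch of station.vectorize (assumption: one row per day, one column per hour).
station.vectorize.sketch <- function(station.dataframe, variable) {
  library(reshape2)
  dcast(data = station.dataframe,
        formula = YEAR + MON + DAY ~ HR,
        value.var = variable,
        fun.aggregate = function(x) x[1])  # keep a single observation per hour
}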
Restrict the vectorized daily profiles of C0875 to the daytime hours and split them into training and test partitions.
#C0875$SOLR <- C0875$SOLR
#C0875.vector <- station.vectorize(C0875,"SOLR")
# Keep YEAR/MON/DAY plus the daytime hours (columns labelled 8 to 17) of each daily profile.
C0875.vector <- C0875.vector[, c(1:3, 11:20)]
#C0875.vector.filter <- station.vectorize(C0875[!is.na(C0875$SOLR),],"SOLR")
#C0875.vector.filter <- C0875.vector.filter[complete.cases(C0875.vector.filter),c(1:3,12:21)]
# Keep only days with a complete daytime profile.
C0875.vector.filter <- C0875.vector[complete.cases(C0875.vector), ]
# Hold out the year 2014 as the test set for prediction.
C0875.vector.filter.test <- C0875.vector.filter[C0875.vector.filter$YEAR==2014,]
C0875.vector.filter.train <- C0875.vector.filter[C0875.vector.filter$YEAR!=2014,]
#C0875.vector.filter.train <- C0875.vector.filter[C0875.vector.filter$MON==11 |
# C0875.vector.filter$MON==12 |
# C0875.vector.filter$MON==1 |
# C0875.vector.filter$MON==2 |
# C0875.vector.filter$MON==3 |
# C0875.vector.filter$MON==4
# ,]
Cluster the training dataset with k-means (k = nclusters) and assign each daily profile in the training set to a cluster.
nclusters <- 5
set.seed(1234)
C0875.filter.train.clusters <- kmeans(C0875.vector.filter.train[, 4:ncol(C0875.vector.filter.train)], nclusters)
C0875.vector.filter.train$clusterid <- C0875.filter.train.clusters$cluster
C0875.vector.filter.train.centroids <- C0875.filter.train.clusters$centers
# Original cluster ids sorted by ascending total intensity of their centroids.
intensity.crescent.ordered.clusterid <- as.numeric(names(sort(rowSums(C0875.vector.filter.train.centroids))))
# Reorder the centroids in ascending order of intensity and reset the row names.
C0875.vector.filter.train.centroids <- C0875.vector.filter.train.centroids[intensity.crescent.ordered.clusterid, ]
rownames(C0875.vector.filter.train.centroids) <- seq(1, nclusters)
# Update the cluster ids so they stay consistent with the reordered centroids:
# each original id is replaced by its rank (position) in the intensity ordering.
C0875.vector.filter.train$clusterid <- match(C0875.vector.filter.train$clusterid, intensity.crescent.ordered.clusterid)
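To see why match() gives the intensity-ranked ids, here is a small worked example with made-up values: if the original cluster ids sorted by ascending centroid intensity are 3, 1, 2, then original cluster 3 must become profile 1, cluster 1 must become profile 2, and cluster 2 must become profile 3.
# Toy illustration of the id remapping (values are made up, not from the data).
ordered.ids <- c(3, 1, 2)            # old ids in ascending order of intensity
old.assignments <- c(1, 2, 2, 3, 1)  # cluster ids as returned by kmeans()
match(old.assignments, ordered.ids)  # 2 3 3 1 2: the intensity-ranked profile ids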
centroids_plot <- data.frame(C0875.vector.filter.train.centroids)
centroids_plot$Profile <- rownames(C0875.vector.filter.train.centroids)
# Restore the hour-of-day column names lost by data.frame().
colnames(centroids_plot) <- c(colnames(C0875.vector.filter.train.centroids), "Profile")
kable(C0875.vector.filter.train.centroids)
| Profile | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 63.75098 | 169.7902 | 273.9124 | 292.8852 | 312.6332 | 334.7674 | 341.3711 | 292.0897 | 236.2965 | 144.7556 |
| 2 | 85.89587 | 248.3484 | 427.8448 | 571.5542 | 629.9225 | 636.4579 | 573.0717 | 473.9575 | 343.4200 | 200.5349 |
| 3 | 95.86391 | 251.6029 | 441.4603 | 599.4685 | 742.0340 | 820.2648 | 836.0977 | 750.7302 | 556.1624 | 328.4749 |
| 4 | 196.82671 | 388.6677 | 599.1102 | 755.9045 | 814.2504 | 825.1088 | 758.6555 | 605.5889 | 455.8949 | 266.5185 |
| 5 | 207.56872 | 417.2371 | 629.2724 | 825.2460 | 954.5118 | 981.7954 | 940.3175 | 863.0599 | 680.0826 | 452.6717 |
ggparcoord(
  data = centroids_plot,
  columns = c(1:ncol(C0875.vector.filter.train.centroids)),
  scale = "globalminmax",
  alphaLines = 0.2,
  groupColumn = ncol(centroids_plot)
) + geom_line(size = 2, alpha = 0.6) + science_theme +
  xlab("Hour of the Day") +
  ylab("Solar Radiation Intensity (W/m2)") +
  ggtitle("Solar Radiation Discretized Profiles from 2011 to 2013 of the Schofield Barracks Station") +
  theme(axis.text.x = element_text(hjust = 1, size = 20, color = "black"),
        axis.text.y = element_text(hjust = 1, size = 20, color = "black"),
        plot.title = element_text(size = 20),
        axis.title.y = element_text(size = 20),
        axis.title.x = element_text(size = 20))
#+ scale_fill_discrete(name="Discretized Profile", breaks=c("1","2","3","4","5"), labels=c("1 (Overcast)","2","3","4","5")) # + scale_colour_brewer(palette="Spectral")
Results.groundtruth <- station.assign.daily.cluster(C0875.vector.filter.test,C0875.vector.filter.train.centroids)
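station.assign.daily.cluster also comes from the solarf package; the assumed behaviour, sketched below, is to label each test-day profile with the id of the closest training centroid (Euclidean distance over the hour columns 4:13). The body is a guess for illustration, not the package code.
# Hypothetical sketch: assign each daily profile to the nearest centroid.
assign.daily.cluster.sketch <- function(daily.profiles, centroids) {
  profile.matrix <- as.matrix(daily.profiles[, 4:13])
  daily.profiles$clusterid <- apply(profile.matrix, 1, function(p) {
    which.min(colSums((t(centroids) - p)^2))  # squared Euclidean distance to each centroid
  })
  return(daily.profiles)
}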
slide_window <- function(vector, size) {
  # Build overlapping windows of `size` consecutive values, one window per row;
  # windows containing NAs (e.g. spanning missing days) are dropped.
  df <- data.frame(rollapply(vector, width = size, by = 1, FUN = function(x) x, align = "left"))
  return(df[complete.cases(df), ])
}
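As a quick usage check with a toy input (not station data), a window of size 3 over the sequence 1 to 5 yields one row per group of three consecutive values.
slide_window(1:5, 3)
#   X1 X2 X3
# 1  1  2  3
# 2  2  3  4
# 3  3  4  5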
calc_daily_mean_error <- function(train.vector, test.vector, size) {
  # Windows of `size` consecutive daily profiles: columns Stm(size-1)..Stm1 are the
  # previous days, St is the day to forecast.
  train <- slide_window(train.vector, size)
  colnames(train) <- c(paste0("Stm", seq(size - 1, 1)), "St")
  train[] <- lapply(train, as.factor)
  test <- slide_window(test.vector, size)
  colnames(test) <- c(paste0("Stm", seq(size - 1, 1)), "St")
  test[] <- lapply(test, as.factor)
  # Fit a Naive Bayes classifier on the training windows and forecast the test windows
  # from their predictor columns (all columns except St).
  m <- NaiveBayes(St ~ ., data = train)
  forecast <- as.numeric(predict(m, test[, -ncol(test), drop = FALSE])$class)
  # Absolute error between each observed daily profile and the forecast centroid profile.
  daily.err <- error.absolute.centroid(Results.groundtruth[size:nrow(Results.groundtruth), 4:13],
                                       forecast, C0875.vector.filter.train.centroids)
  mean.daily.err <- mean(daily.err)
  return(mean.daily.err)
}
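error.absolute.centroid is also a solarf function; the sketch below shows the assumed computation: for each day, the sum of absolute differences between the observed hourly profile and the hourly profile of the forecast centroid. It is an illustrative reimplementation under that assumption, not the package source.
# Hypothetical sketch: per-day absolute error between observed hourly profiles
# and the hourly profiles of the forecast centroids.
error.absolute.centroid.sketch <- function(observed.profiles, forecast.ids, centroids) {
  rowSums(abs(as.matrix(observed.profiles) - centroids[forecast.ids, , drop = FALSE]))
}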
Add rows for timestamps that do not occur in the data and fill them with NAs, so that the sliding window can verify that the days in each window are consecutive.
C0875.vector.filter.train <- station.add.missing.profiles(C0875.vector.filter.train)
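station.add.missing.profiles is likewise part of solarf. The sketch below captures the behaviour assumed above: insert an NA row for every calendar day missing from the data, so that windows spanning a gap contain NAs and are discarded by complete.cases() inside slide_window(). The implementation is a guess for illustration.
# Hypothetical sketch: pad missing calendar days with NA rows.
add.missing.profiles.sketch <- function(daily.data) {
  dates <- as.Date(sprintf("%d-%02d-%02d", daily.data$YEAR, daily.data$MON, daily.data$DAY))
  all.dates <- seq(min(dates), max(dates), by = "day")
  full <- data.frame(YEAR = as.numeric(format(all.dates, "%Y")),
                     MON  = as.numeric(format(all.dates, "%m")),
                     DAY  = as.numeric(format(all.dates, "%d")))
  merge(full, daily.data, by = c("YEAR", "MON", "DAY"), all.x = TRUE)  # NA-filled rows for missing days
}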
# Number of consecutive days per window: a window of 2 days means forecasting today's profile from the one previous day.
err.means <- vector("list",7)
for(i in 2:8){
err.means[[i-1]] <- calc_daily_mean_error(C0875.vector.filter.train$clusterid,Results.groundtruth$clusterid,i)
}
df.err.means <- as.data.frame(err.means)
# Bk = Naive Bayes model that forecasts today's profile from the k previous days.
colnames(df.err.means) <- c("B1", "B2", "B3", "B4", "B5", "B6", "B7")
labels <- colnames(df.err.means)
qplot(labels, as.numeric(df.err.means), geom = "bar", stat = "identity",
      xlab = "Model", ylab = "Daily Mean Absolute Error", main = "Naive Bayes") + science_theme
kable(df.err.means)
| B1 | B2 | B3 | B4 | B5 | B6 | B7 |
|---|---|---|---|---|---|---|
| 1337.365 | 1366.718 | 1405.523 | 1427.534 | 1438.938 | 1444.905 | 1513.674 |