Load required libraries and plot template.
#Load Required Libraries
library(solarf)
library(knitr) #For pretty tables on the report using kable
library(plyr)
library(ggplot2)
library(GGally)
library(caret)
library(extrafont)
#Theme for Plot
science_theme =
theme(
panel.background=element_blank(),
#panel.grid.major = element_line(size = 0.5, color = "grey"),
#axis.line = element_line(size = 0.7, color = "black"),
#legend.position = c(0.9, 0.7),
#text = element_text(size = 14),
axis.line=element_blank(),
text = element_text(size=15, family="Interstate"),
axis.text.x = element_text(hjust = 1, size=15,color="black"),
axis.text.y = element_text(hjust = 1, size=15,color="black"),
axis.title.y=element_text(size=15),
axis.title.x=element_text(size=15),
axis.ticks = element_line(colour=NA),
panel.grid.major = element_line(colour = 'gray', linetype = 'dashed'),
panel.grid.minor = element_line(colour = NA),
panel.background = element_rect(colour = 'white'))
Data Preparation Parameters
variable <- "SOLR"
nclusters <- 5
#C0875.h <- readRDS("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/C0875.h.rds")
C0875 <- read.csv("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/C0875_2011_2014.csv")
## CHANGE GRANULARITY FUNCTION
station.vectorize.hour <- function(station.dataframe,variable){
library(reshape2)
use.only.last.ocurrence <- function(minute.obs){
return (minute.obs[1])
}
station.dataframe.vectorized <- dcast(data=station.dataframe, formula=YEAR + MON + DAY + HR ~ MIN,value.var = variable,fun.aggregate=use.only.last.ocurrence)
return (station.dataframe.vectorized)
}
##
## CHANGE GRANULARITY REP.CODE
C0875.vector.h <- station.vectorize.hour(C0875,variable)
C0875.h <- C0875.vector.h[,c("YEAR","MON","DAY","HR")]
C0875.h[,variable] <- rowMeans(C0875.vector.h[,5:ncol(C0875.vector.h)],na.rm=TRUE) #Calculate the mean for the hour from min 0 to min 60 ; baseline is hence hour and 30 mins.
C0875.h$HR <- C0875.h$HR + 1
C0875.h[is.nan(C0875.h[,variable]),variable] <- NA
##END CHANGE GRANULARITY
Vectorize C0875.h over the training and test partitions.
vector <- station.vectorize(C0875.h,variable)
vector <- vector[,c(1:3,11:20)]
If SOLR is missing any hour value, the day is discarded.
vector <- vector[complete.cases(vector),]
Cluster with 5-means the dataset to obtain the daily profiles.
#Set seed for reproducibility
set.seed(1234)
kmeans.results <- kmeans(vector[,4:ncol(vector)],nclusters)
vector$clusterid <- kmeans.results[[1]]
centroids <- kmeans.results[[2]]
#Order the centroids in crescent order of intensity and reset rows
ordered.centroids <- as.numeric(names(sort(rowSums(centroids))))
centroids <- centroids[ordered.centroids,]
rownames(centroids) <- seq(1,nclusters)
#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
vector$clusterid <- ordered.centroids[vector$clusterid]
Define the istransition variable by observing if the previous day had a different clusterid than the current.
vector$istransition <- vector$clusterid
for (i in 2:length(vector$istransition)){
if(vector$istransition[i-1] == vector$istransition[i])
vector$istransition[i] <- 0 #False for C++
else
vector$istransition[i] <- 1 #True for C++
}
vector$istransition[1] <- -9999 # NA for C++
vector.solar <- vector
Sample 5 vectors.
kable(vector[sample(nrow(vector),5),])
| YEAR | MON | DAY | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | clusterid | istransition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 956 | 2014 | 2 | 25 | 58.70000 | 153.0000 | 290.0833 | 743.4167 | 591.4167 | 545.7500 | 407.2500 | 244.1667 | 215.5833 | 266.0833 | 3 | 1 |
| 433 | 2012 | 4 | 29 | 219.50000 | 530.9167 | 800.2500 | 547.0833 | 627.7500 | 740.2500 | 820.0000 | 859.8333 | 632.8333 | 291.0000 | 4 | 1 |
| 644 | 2013 | 3 | 9 | 60.91667 | 153.0833 | 254.4167 | 375.0000 | 692.0000 | 674.0000 | 1003.8333 | 726.9167 | 615.0833 | 490.0000 | 4 | 1 |
| 970 | 2014 | 3 | 11 | 174.00000 | 414.6667 | 613.9167 | 696.8333 | 879.0000 | 942.4167 | 913.0833 | 815.2500 | 657.5833 | 451.0000 | 5 | 1 |
| 850 | 2013 | 11 | 7 | 158.77778 | 382.0000 | 574.5833 | 758.3333 | 771.3333 | 446.0833 | 268.2727 | 378.6667 | 209.5833 | 212.1000 | 1 | 0 |
Centroids.
centroids_plot <- data.frame(centroids)
centroids_plot$color <-rownames(centroids)
colnames(centroids_plot) <- c(colnames(centroids),"Profile")
kable(centroids)
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
|---|---|---|---|---|---|---|---|---|---|
| 41.81761 | 125.2847 | 234.6747 | 260.8534 | 248.8749 | 251.4028 | 259.4250 | 263.0444 | 205.4177 | 132.6385 |
| 86.53531 | 231.7437 | 383.0082 | 489.5958 | 502.4530 | 550.8664 | 545.9115 | 442.7662 | 321.3274 | 184.9058 |
| 114.53358 | 287.3950 | 491.2309 | 664.8130 | 753.6040 | 744.1934 | 620.3279 | 468.9166 | 348.7987 | 211.3311 |
| 129.88862 | 305.6151 | 488.8596 | 631.5216 | 744.7629 | 814.8602 | 811.9553 | 720.0813 | 535.4632 | 317.5342 |
| 206.28015 | 417.7976 | 638.0186 | 826.6715 | 949.2170 | 977.1658 | 941.4765 | 847.7259 | 664.0621 | 448.9646 |
ggparcoord(
data = centroids_plot,
columns=c(1:(ncol(centroids)-1)),
scale="globalminmax",
alphaLines=0.2,
groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
ylab("Solar Radiation Itensity (W/m2") + ggtitle(paste('C0875.h',variable,'Centroids'))# + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))
Data Preparation Parameters
variable <- "ALTI"
nclusters <- 5
Vectorize C0875.h over the training and test partitions.
## CHANGE GRANULARITY REP.CODE
C0875.vector.h <- station.vectorize.hour(C0875,variable)
C0875.h <- C0875.vector.h[,c("YEAR","MON","DAY","HR")]
C0875.h[,variable] <- rowMeans(C0875.vector.h[,5:ncol(C0875.vector.h)],na.rm=TRUE) #Calculate the mean for the hour from min 0 to min 60 ; baseline is hence hour and 30 mins.
C0875.h$HR <- C0875.h$HR + 1
C0875.h[is.nan(C0875.h[,variable]),variable] <- NA
##END CHANGE GRANULARITY
vector <- station.vectorize(C0875.h,variable)
vector <- vector[,c(1:3,11:20)]
If ALTI is missing any hour value, the day is discarded.
vector <- vector[complete.cases(vector),]
#Set seed for reproducibility
set.seed(1234)
#Mean precipitation of each day
mean.prec <- rowMeans(vector[,4:ncol(vector)])
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.1.3
## Loading required package: grid
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.1.3
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
## cluster
##
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 3.1.3
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:plyr':
##
## is.discrete, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
#Min and max
bins <- cut(mean.prec,breaks=5)
interval_labels <- levels(cut(mean.prec,breaks=5))
bin_interval_mean <- function(interval_label){
lower_and_upper <- unlist(strsplit(interval_label, ",", fixed = TRUE))
lower <- as.numeric(substr(lower_and_upper[1],start=2,stop=nchar(lower_and_upper[1])))
upper <- as.numeric(substr(lower_and_upper[2],start=1,stop=nchar(lower_and_upper[2])-1))
return (mean(c(lower,upper)))
}
centroids[1,1:ncol(centroids)] <- bin_interval_mean(interval_labels[1])
centroids[2,1:ncol(centroids)] <- bin_interval_mean(interval_labels[2])
centroids[3,1:ncol(centroids)] <- bin_interval_mean(interval_labels[3])
centroids[4,1:ncol(centroids)] <- bin_interval_mean(interval_labels[4])
centroids[5,1:ncol(centroids)] <- bin_interval_mean(interval_labels[5])
vector$clusterid <- bins
#Order the centroids in crescent order of intensity and reset rows
ordered.centroids <- as.numeric(names(sort(rowSums(centroids))))
centroids <- centroids[ordered.centroids,]
rownames(centroids) <- seq(1,nclusters)
#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
vector$clusterid <- ordered.centroids[vector$clusterid]
Define the istransition variable by observing if the previous day had a different clusterid than the current.
vector$istransition <- vector$clusterid
for (i in 2:length(vector$istransition)){
if(vector$istransition[i-1] == vector$istransition[i])
vector$istransition[i] <- 0 #False for C++
else
vector$istransition[i] <- 1 #True for C++
}
vector$istransition[1] <- -9999 # NA for C++
vector.prec <- vector
Sample 5 vectors.
kable(vector[sample(nrow(vector),5),])
| YEAR | MON | DAY | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | clusterid | istransition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 154 | 2011 | 6 | 13 | 30.07333 | 30.0900 | 30.09500 | 30.10167 | 30.09889 | 30.09333 | 30.08583 | 30.07000 | 30.07000 | 30.06833 | 4 | 1 |
| 783 | 2013 | 8 | 28 | 30.01000 | 30.0100 | 30.01000 | 29.99833 | 29.98333 | 29.96667 | 29.95250 | 29.94000 | 29.93667 | 29.93000 | 3 | 1 |
| 763 | 2013 | 7 | 24 | 30.00583 | 30.0100 | 30.00667 | 29.99500 | 29.98917 | 29.98083 | 29.97000 | 29.95455 | 29.94091 | 29.93545 | 3 | 1 |
| 782 | 2013 | 8 | 27 | 30.10400 | 30.1100 | 30.11000 | 30.11000 | 30.09500 | 30.07000 | 30.04833 | 30.03083 | 30.01333 | 30.01083 | 4 | 1 |
| 1063 | 2014 | 7 | 12 | 30.00000 | 30.0025 | 30.00000 | 29.99583 | 29.98583 | 29.98000 | 29.97250 | 29.96667 | 29.96000 | 29.96000 | 3 | 1 |
Centroids.
centroids_plot <- data.frame(centroids)
centroids_plot$color <-rownames(centroids)
colnames(centroids_plot) <- c(colnames(centroids),"Profile")
kable(centroids)
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
|---|---|---|---|---|---|---|---|---|---|
| 29.810 | 29.810 | 29.810 | 29.810 | 29.810 | 29.810 | 29.810 | 29.810 | 29.810 | 29.810 |
| 29.905 | 29.905 | 29.905 | 29.905 | 29.905 | 29.905 | 29.905 | 29.905 | 29.905 | 29.905 |
| 29.995 | 29.995 | 29.995 | 29.995 | 29.995 | 29.995 | 29.995 | 29.995 | 29.995 | 29.995 |
| 30.090 | 30.090 | 30.090 | 30.090 | 30.090 | 30.090 | 30.090 | 30.090 | 30.090 | 30.090 |
| 30.185 | 30.185 | 30.185 | 30.185 | 30.185 | 30.185 | 30.185 | 30.185 | 30.185 | 30.185 |
ggparcoord(
data = centroids_plot,
columns=c(1:(ncol(centroids)-1)),
scale="globalminmax",
alphaLines=0.2,
groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
ylab("Precipitation (in)") + ggtitle(paste('C0875.h',variable,'Centroids'))# + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))
Data Preparation Parameters
variable <- "TMPF"
nclusters <- 5
Vectorize C0875.h over the training and test partitions.
## CHANGE GRANULARITY REP.CODE
C0875.vector.h <- station.vectorize.hour(C0875,variable)
C0875.h <- C0875.vector.h[,c("YEAR","MON","DAY","HR")]
C0875.h[,variable] <- rowMeans(C0875.vector.h[,5:ncol(C0875.vector.h)],na.rm=TRUE) #Calculate the mean for the hour from min 0 to min 60 ; baseline is hence hour and 30 mins.
C0875.h$HR <- C0875.h$HR + 1
C0875.h[is.nan(C0875.h[,variable]),variable] <- NA
##END CHANGE GRANULARITY
vector <- station.vectorize(C0875.h,variable)
vector <- vector[,c(1:3,11:20)]
If TMPF is missing any hour value, the day is discarded.
vector <- vector[complete.cases(vector),]
Cluster with 5-means the dataset to obtain the daily profiles.
#Set seed for reproducibility
set.seed(1234)
kmeans.results <- kmeans(vector[,4:ncol(vector)],nclusters)
vector$clusterid <- kmeans.results[[1]]
centroids <- kmeans.results[[2]]
#Order the centroids in crescent order of intensity and reset rows
ordered.centroids <- as.numeric(names(sort(rowSums(centroids))))
centroids <- centroids[ordered.centroids,]
rownames(centroids) <- seq(1,nclusters)
#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
vector$clusterid <- ordered.centroids[vector$clusterid]
Define the istransition variable by observing if the previous day had a different clusterid than the current.
vector$istransition <- vector$clusterid
for (i in 2:length(vector$istransition)){
if(vector$istransition[i-1] == vector$istransition[i])
vector$istransition[i] <- 0 #False for C++
else
vector$istransition[i] <- 1 #True for C++
}
vector$istransition[1] <- -9999 # NA for C++
vector.tmpfc <- vector
Sample 5 vectors.
kable(vector[sample(nrow(vector),5),])
| YEAR | MON | DAY | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | clusterid | istransition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 940 | 2014 | 2 | 9 | 71.58333 | 73.00000 | 74.25000 | 75.16667 | 76.16667 | 76.91667 | 78.33333 | 77.83333 | 78.08333 | 77.16667 | 4 | 1 |
| 8 | 2011 | 1 | 8 | 67.00000 | 67.41667 | 68.58333 | 70.25000 | 71.36364 | 71.08333 | 71.25000 | 71.41667 | 70.91667 | 70.66667 | 2 | 1 |
| 611 | 2013 | 1 | 31 | 69.08333 | 70.36364 | 71.69231 | 72.91667 | 73.16667 | 74.09091 | 74.25000 | 73.66667 | 72.91667 | 72.41667 | 3 | 1 |
| 956 | 2014 | 2 | 25 | 66.40000 | 67.00000 | 68.08333 | 70.66667 | 71.08333 | 72.00000 | 73.25000 | 72.16667 | 71.00000 | 70.16667 | 2 | 1 |
| 829 | 2013 | 10 | 17 | 75.87500 | 76.66667 | 78.16667 | 81.41667 | 80.83333 | 80.91667 | 82.75000 | 81.66667 | 80.58333 | 80.00000 | 5 | 1 |
Centroids.
centroids_plot <- data.frame(centroids)
centroids_plot$color <-rownames(centroids)
colnames(centroids_plot) <- c(colnames(centroids),"Profile")
kable(centroids)
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
|---|---|---|---|---|---|---|---|---|---|
| 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 |
| 65.98596 | 67.52987 | 69.47197 | 70.46184 | 71.00570 | 71.15887 | 71.00725 | 70.30359 | 69.94046 | 69.45749 |
| 70.43443 | 71.56031 | 72.76930 | 73.75541 | 74.39211 | 74.90663 | 75.05766 | 74.91218 | 74.37025 | 73.60410 |
| 73.11687 | 74.51697 | 75.93278 | 76.99524 | 77.75652 | 78.12831 | 78.22402 | 78.08065 | 77.47524 | 76.55797 |
| 75.74037 | 77.34352 | 79.01742 | 80.16918 | 80.98724 | 81.61613 | 81.80857 | 81.53023 | 80.82750 | 79.83494 |
ggparcoord(
data = centroids_plot,
columns=c(1:(ncol(centroids)-1)),
scale="globalminmax",
alphaLines=0.2,
groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
ylab("Solar Radiation Itensity (W/m2") + ggtitle(paste('C0875.h',variable,'Centroids'))# + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))
Data Preparation Parameters
variable <- "DWPF"
nclusters <- 5
Vectorize C0875.h over the training and test partitions.
## CHANGE GRANULARITY REP.CODE
C0875.vector.h <- station.vectorize.hour(C0875,variable)
C0875.h <- C0875.vector.h[,c("YEAR","MON","DAY","HR")]
C0875.h[,variable] <- rowMeans(C0875.vector.h[,5:ncol(C0875.vector.h)],na.rm=TRUE) #Calculate the mean for the hour from min 0 to min 60 ; baseline is hence hour and 30 mins.
C0875.h$HR <- C0875.h$HR + 1
C0875.h[is.nan(C0875.h[,variable]),variable] <- NA
##END CHANGE GRANULARITY
vector <- station.vectorize(C0875.h,variable)
vector <- vector[,c(1:3,11:20)]
If DWPF is missing any hour value, the day is discarded.
vector <- vector[complete.cases(vector),]
Cluster with 5-means the dataset to obtain the daily profiles.
#Set seed for reproducibility
set.seed(1234)
kmeans.results <- kmeans(vector[,4:ncol(vector)],nclusters)
vector$clusterid <- kmeans.results[[1]]
centroids <- kmeans.results[[2]]
#Order the centroids in crescent order of intensity and reset rows
ordered.centroids <- as.numeric(names(sort(rowSums(centroids))))
centroids <- centroids[ordered.centroids,]
rownames(centroids) <- seq(1,nclusters)
#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
vector$clusterid <- ordered.centroids[vector$clusterid]
Define the istransition variable by observing if the previous day had a different clusterid than the current.
vector$istransition <- vector$clusterid
for (i in 2:length(vector$istransition)){
if(vector$istransition[i-1] == vector$istransition[i])
vector$istransition[i] <- 0 #False for C++
else
vector$istransition[i] <- 1 #True for C++
}
vector$istransition[1] <- -9999 # NA for C++
vector.dwpfc <- vector
Sample 5 vectors.
kable(vector[sample(nrow(vector),5),])
| YEAR | MON | DAY | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | clusterid | istransition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 943 | 2014 | 2 | 12 | 66.35000 | 66.56667 | 66.56667 | 68.64167 | 68.70833 | 67.80000 | 66.59167 | 65.06667 | 65.95833 | 66.76667 | 4 | 1 |
| 205 | 2011 | 8 | 14 | 66.25000 | 67.02222 | 67.13333 | 66.98889 | 67.32500 | 67.95556 | 67.64444 | 67.72500 | 68.10000 | 68.07000 | 4 | 1 |
| 620 | 2013 | 2 | 13 | 56.11667 | 57.08333 | 57.80833 | 56.55833 | 57.96667 | 57.35000 | 57.29167 | 57.91667 | 57.84167 | 58.25833 | 2 | 1 |
| 960 | 2014 | 3 | 1 | 65.73750 | 65.36667 | 65.27500 | 66.06667 | 66.42500 | 67.12500 | 65.78333 | 66.45833 | 66.00000 | 66.64167 | 4 | 1 |
| 833 | 2013 | 10 | 21 | 63.24000 | 63.24167 | 63.22500 | 64.36667 | 64.66667 | 66.00000 | 65.75000 | 65.75000 | 64.74167 | 65.05833 | 3 | 1 |
Centroids.
centroids_plot <- data.frame(centroids)
centroids_plot$color <-rownames(centroids)
colnames(centroids_plot) <- c(colnames(centroids),"Profile")
kable(centroids)
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
|---|---|---|---|---|---|---|---|---|---|
| 50.36524 | 51.05823 | 51.28203 | 51.90283 | 52.51746 | 53.31268 | 53.49643 | 54.07300 | 54.04386 | 53.79800 |
| 59.13956 | 59.58633 | 59.90201 | 59.91526 | 60.04278 | 60.36267 | 60.52514 | 60.65857 | 60.85279 | 60.79912 |
| 63.71016 | 64.12099 | 64.41991 | 64.60124 | 64.64479 | 64.67152 | 64.66111 | 64.62758 | 64.63555 | 64.66560 |
| 66.75131 | 67.19017 | 67.42462 | 67.55822 | 67.66095 | 67.65834 | 67.57672 | 67.61297 | 67.61228 | 67.51312 |
| 69.76221 | 70.20099 | 70.53443 | 70.73323 | 70.74292 | 70.73599 | 70.78219 | 70.77175 | 70.63376 | 70.41722 |
ggparcoord(
data = centroids_plot,
columns=c(1:(ncol(centroids)-1)),
scale="globalminmax",
alphaLines=0.2,
groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
ylab("Solar Radiation Itensity (W/m2") + ggtitle(paste('C0875.h',variable,'Centroids'))# + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))
Data Preparation Parameters
variable <- "RELH"
nclusters <- 5
Vectorize C0875.h over the training and test partitions.
## CHANGE GRANULARITY REP.CODE
C0875.vector.h <- station.vectorize.hour(C0875,variable)
C0875.h <- C0875.vector.h[,c("YEAR","MON","DAY","HR")]
C0875.h[,variable] <- rowMeans(C0875.vector.h[,5:ncol(C0875.vector.h)],na.rm=TRUE) #Calculate the mean for the hour from min 0 to min 60 ; baseline is hence hour and 30 mins.
C0875.h$HR <- C0875.h$HR + 1
C0875.h[is.nan(C0875.h[,variable]),variable] <- NA
##END CHANGE GRANULARITY
vector <- station.vectorize(C0875.h,variable)
vector <- vector[,c(1:3,11:20)]
If RELH is missing any hour value, the day is discarded.
vector <- vector[complete.cases(vector),]
Cluster with 5-means the dataset to obtain the daily profiles.
#Set seed for reproducibility
set.seed(1234)
kmeans.results <- kmeans(vector[,4:ncol(vector)],nclusters)
vector$clusterid <- kmeans.results[[1]]
centroids <- kmeans.results[[2]]
#Order the centroids in crescent order of intensity and reset rows
ordered.centroids <- as.numeric(names(sort(rowSums(centroids))))
centroids <- centroids[ordered.centroids,]
rownames(centroids) <- seq(1,nclusters)
#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
vector$clusterid <- ordered.centroids[vector$clusterid]
Define the istransition variable by observing if the previous day had a different clusterid than the current.
vector$istransition <- vector$clusterid
for (i in 2:length(vector$istransition)){
if(vector$istransition[i-1] == vector$istransition[i])
vector$istransition[i] <- 0 #False for C++
else
vector$istransition[i] <- 1 #True for C++
}
vector$istransition[1] <- -9999 # NA for C++
vector.relh <- vector
Sample 5 vectors.
kable(vector[sample(nrow(vector),5),])
| YEAR | MON | DAY | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | clusterid | istransition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 940 | 2014 | 2 | 9 | 86.83333 | 81.75000 | 79.50000 | 77.50000 | 76.75000 | 75.00000 | 73.00000 | 74.66667 | 74.50000 | 74.08333 | 4 | 1 |
| 8 | 2011 | 1 | 8 | 45.91667 | 46.08333 | 46.00000 | 45.33333 | 45.09091 | 45.00000 | 45.33333 | 45.66667 | 45.91667 | 46.00000 | 2 | 1 |
| 611 | 2013 | 1 | 31 | 81.08333 | 80.36364 | 78.07692 | 75.83333 | 74.00000 | 73.09091 | 72.41667 | 73.25000 | 76.00000 | 76.58333 | 4 | 1 |
| 956 | 2014 | 2 | 25 | 88.10000 | 87.50000 | 82.75000 | 75.16667 | 72.66667 | 70.08333 | 68.16667 | 71.00000 | 71.41667 | 72.16667 | 4 | 1 |
| 829 | 2013 | 10 | 17 | 77.62500 | 76.25000 | 75.50000 | 69.25000 | 72.33333 | 70.16667 | 65.66667 | 67.91667 | 69.25000 | 68.83333 | 3 | 1 |
Centroids.
centroids_plot <- data.frame(centroids)
centroids_plot$color <-rownames(centroids)
colnames(centroids_plot) <- c(colnames(centroids),"Profile")
kable(centroids)
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
|---|---|---|---|---|---|---|---|---|---|
| 8.675926 | 13.15741 | 15.88889 | 16.13889 | 17.30556 | 17.41667 | 16.29630 | 13.85185 | 13.33333 | 13.12963 |
| 69.414437 | 66.72594 | 63.07818 | 60.83641 | 59.30036 | 58.76455 | 58.89653 | 59.84157 | 61.49409 | 63.29273 |
| 76.780981 | 74.10610 | 71.03875 | 68.50534 | 66.92341 | 65.76468 | 65.27248 | 65.90217 | 67.35675 | 69.43879 |
| 81.029615 | 79.26991 | 76.89404 | 74.98854 | 73.41702 | 72.45610 | 72.15750 | 72.50476 | 73.94754 | 75.58106 |
| 87.538562 | 86.73306 | 85.43851 | 84.46789 | 84.07478 | 84.12490 | 83.66399 | 83.09817 | 83.32106 | 84.18066 |
ggparcoord(
data = centroids_plot,
columns=c(1:(ncol(centroids)-1)),
scale="globalminmax",
alphaLines=0.2,
groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
ylab("Solar Radiation Itensity (W/m2") + ggtitle(paste('C0875.h',variable,'Centroids'))# + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))
Data Preparation Parameters
variable <- "SKNT"
nclusters <- 5
Vectorize C0875.h over the training and test partitions.
## CHANGE GRANULARITY REP.CODE
C0875.vector.h <- station.vectorize.hour(C0875,variable)
C0875.h <- C0875.vector.h[,c("YEAR","MON","DAY","HR")]
C0875.h[,variable] <- rowMeans(C0875.vector.h[,5:ncol(C0875.vector.h)],na.rm=TRUE) #Calculate the mean for the hour from min 0 to min 60 ; baseline is hence hour and 30 mins.
C0875.h$HR <- C0875.h$HR + 1
C0875.h[is.nan(C0875.h[,variable]),variable] <- NA
##END CHANGE GRANULARITY
vector <- station.vectorize(C0875.h,variable)
vector <- vector[,c(1:3,11:20)]
If SKNT is missing any hour value, the day is discarded.
vector <- vector[complete.cases(vector),]
Cluster with 5-means the dataset to obtain the daily profiles.
#Set seed for reproducibility
set.seed(1234)
kmeans.results <- kmeans(vector[,4:ncol(vector)],nclusters)
vector$clusterid <- kmeans.results[[1]]
centroids <- kmeans.results[[2]]
#Order the centroids in crescent order of intensity and reset rows
ordered.centroids <- as.numeric(names(sort(rowSums(centroids))))
centroids <- centroids[ordered.centroids,]
rownames(centroids) <- seq(1,nclusters)
#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
vector$clusterid <- ordered.centroids[vector$clusterid]
Define the istransition variable by observing if the previous day had a different clusterid than the current.
vector$istransition <- vector$clusterid
for (i in 2:length(vector$istransition)){
if(vector$istransition[i-1] == vector$istransition[i])
vector$istransition[i] <- 0 #False for C++
else
vector$istransition[i] <- 1 #True for C++
}
vector$istransition[1] <- -9999 # NA for C++
vector.wind <- vector
Sample 5 vectors.
kable(vector[sample(nrow(vector),5),])
| YEAR | MON | DAY | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | clusterid | istransition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 840 | 2013 | 10 | 28 | 3.0000000 | 3.750000 | 4.500000 | 3.416667 | 3.750000 | 4.166667 | 3.545454 | 4.416667 | 7.333333 | 4.272727 | 4 | 1 |
| 10 | 2011 | 1 | 10 | 6.3333333 | 6.666667 | 7.666667 | 7.166667 | 6.500000 | 7.666667 | 8.083333 | 12.750000 | 21.583333 | 13.833333 | 2 | 1 |
| 297 | 2011 | 12 | 1 | 0.5833333 | 1.250000 | 1.416667 | 1.333333 | 1.916667 | 2.500000 | 3.166667 | 2.666667 | 3.833333 | 6.666667 | 4 | 1 |
| 867 | 2013 | 11 | 24 | 6.0000000 | 7.916667 | 6.250000 | 7.090909 | 6.818182 | 8.583333 | 6.416667 | 6.083333 | 5.750000 | 7.083333 | 2 | 1 |
| 708 | 2013 | 5 | 26 | 7.2500000 | 7.583333 | 8.833333 | 8.500000 | 9.333333 | 7.833333 | 8.000000 | 8.166667 | 8.916667 | 8.250000 | 2 | 1 |
Centroids.
centroids_plot <- data.frame(centroids)
centroids_plot$color <-rownames(centroids)
colnames(centroids_plot) <- c(colnames(centroids),"Profile")
kable(centroids)
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
|---|---|---|---|---|---|---|---|---|---|
| 2.515120 | 2.801432 | 3.417928 | 3.787969 | 4.227666 | 4.363395 | 4.489103 | 4.543058 | 4.612481 | 4.239605 |
| 6.964374 | 7.476228 | 7.725313 | 7.883218 | 8.178154 | 8.140686 | 8.075538 | 7.940529 | 7.875768 | 7.810506 |
| 14.266923 | 11.546448 | 11.325633 | 14.400040 | 11.718023 | 11.658908 | 11.609163 | 11.833090 | 11.463684 | 11.407992 |
| 149.272222 | 6.819444 | 7.194444 | 8.166667 | 8.069444 | 8.055556 | 8.277778 | 8.458333 | 7.704546 | 8.333333 |
| 326.800000 | 23.832685 | 8.570707 | 9.333333 | 8.135101 | 8.558081 | 8.291667 | 7.666667 | 7.722222 | 7.236111 |
ggparcoord(
data = centroids_plot,
columns=c(1:(ncol(centroids)-1)),
scale="globalminmax",
alphaLines=0.2,
groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
ylab("Solar Radiation Itensity (W/m2") + ggtitle(paste('C0875.h',variable,'Centroids'))# + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))
entropy <- read.table("~/Desktop/OCI/out/entropy-tsmissing.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))
++ Solar Cluster x Solar Transition
+++ Data Preparation
solar <- vector.solar[,c("YEAR","MON","DAY","istransition")]
solarc <- vector.solar[,c("YEAR","MON","DAY","clusterid")]
write.table(vector.solar[,c("clusterid","istransition")],file="~/Desktop/OCI/data/solarCid_solarTransition.csv",col.names=FALSE,row.names=FALSE,sep=",")
+++ Summary
entropy <- read.table("~/Desktop/OCI/out/entropy-solarCid_solarTransition.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))
++ Humidity Transition x Solar Transition
+++ Data Preparation
#Get the Humidity Transitions
relh <- vector.relh[,c("YEAR","MON","DAY","istransition")]
#Inner Join with the Solar Transitions
pair <- merge(relh,solar,by=c("YEAR","MON","DAY"))[,c("istransition.x","istransition.y")]
colnames(pair) <- c("relht","solart")
#Save
write.table(pair[,c("relht","solart")],file="~/Desktop/OCI/data/humidityTransition_SolarTransition.csv",col.names=FALSE,row.names=FALSE,sep=",")
+++ Summary
entropy <- read.table("~/Desktop/OCI/out/entropy-humidityTransition_SolarTransition.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))
#Get the Humidity Transitions
relhc <- vector.relh[,c("YEAR","MON","DAY","clusterid")]
#Inner Join with the Solar and Humidity Centroids
pair <- merge(relhc,solarc,by=c("YEAR","MON","DAY"))[,c("clusterid.x","clusterid.y")]
colnames(pair) <- c("relhc","solarc")
#Save
write.table(pair[,c("relhc","solarc")],file="~/Desktop/OCI/data/c0875_humidityCentroid_SolarCentroid.csv",col.names=FALSE,row.names=FALSE,sep=",")
entropy <- read.table("~/Desktop/OCI/out/entropy-c0875_humidityCentroid_SolarCentroid.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))
#Get the Temperature Centroids
tmpfc <- vector.tmpfc[,c("YEAR","MON","DAY","clusterid")]
#Inner Join with the Solar and Temperature Centroids
pair <- merge(tmpfc,solarc,by=c("YEAR","MON","DAY"))[,c("clusterid.x","clusterid.y")]
colnames(pair) <- c("tmpfc","solarc")
#Save
write.table(pair[,c("tmpfc","solarc")],file="~/Desktop/OCI/data/c0875_temperatureCentroid_SolarCentroid.csv",col.names=FALSE,row.names=FALSE,sep=",")
entropy <- read.table("~/Desktop/OCI/out/entropy-c0875_temperatureCentroid_SolarCentroid.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))
#Get the Humidity Transitions
dwpfc <- vector.dwpfc[,c("YEAR","MON","DAY","clusterid")]
#Inner Join with the Solar and Humidity Centroids
pair <- merge(dwpfc,solarc,by=c("YEAR","MON","DAY"))[,c("clusterid.x","clusterid.y")]
colnames(pair) <- c("dwpfc","solarc")
#Save
write.table(pair[,c("dwpfc","solarc")],file="~/Desktop/OCI/data/c0875_dewpointCentroid_SolarCentroid.csv",col.names=FALSE,row.names=FALSE,sep=",")
entropy <- read.table("~/Desktop/OCI/out/entropy-c0875_dewpointCentroid_SolarCentroid.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))
#Get the Humidity Transitions
precc <- vector.prec[,c("YEAR","MON","DAY","clusterid")]
#Inner Join with the Solar and Humidity Centroids
pair <- merge(precc,solarc,by=c("YEAR","MON","DAY"))[,c("clusterid.x","clusterid.y")]
colnames(pair) <- c("precc","solarc")
#Save
write.table(pair[,c("precc","solarc")],file="~/Desktop/OCI/data/c0875_pressureCentroid_SolarCentroid.csv",col.names=FALSE,row.names=FALSE,sep=",")
entropy <- read.table("~/Desktop/OCI/out/entropy-c0875_pressureCentroid_SolarCentroid.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))
++ Wind Speed Transition x Solar Transition
+++ Data Preparation
#Get the wind Transitions
wind <- vector.wind[,c("YEAR","MON","DAY","istransition")]
#Inner Join with the Solar Transitions
pair <- merge(wind,solar,by=c("YEAR","MON","DAY"))[,c("istransition.x","istransition.y")]
colnames(pair) <- c("windt","solart")
#Save
write.table(pair[,c("windt","solart")],file="~/Desktop/OCI/data/windTransition_SolarTransition.csv",col.names=FALSE,row.names=FALSE,sep=",")
+++ Summary
entropy <- read.table("~/Desktop/OCI/out/entropy-windTransition_SolarTransition.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))
#Get the wind Transitions
windc <- vector.wind[,c("YEAR","MON","DAY","clusterid")]
#Inner Join with the Solar Transitions
pair <- merge(windc,solarc,by=c("YEAR","MON","DAY"))[,c("clusterid.x","clusterid.y")]
colnames(pair) <- c("windc","solarc")
#Save
write.table(pair[,c("windc","solarc")],file="~/Desktop/OCI/data/c0875_windCentroid_SolarCentroid.csv",col.names=FALSE,row.names=FALSE,sep=",")
entropy <- read.table("~/Desktop/OCI/out/entropy-c0875_windCentroid_SolarCentroid.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))