Load required libraries and plot template.
#Load Required Libraries
library(solarf)
library(knitr) #For pretty tables on the report using kable
library(plyr)
library(ggplot2)
library(GGally)
library(caret)
library(extrafont)
#Theme for Plot
science_theme =
theme(
panel.background=element_blank(),
#panel.grid.major = element_line(size = 0.5, color = "grey"),
#axis.line = element_line(size = 0.7, color = "black"),
#legend.position = c(0.9, 0.7),
#text = element_text(size = 14),
axis.line=element_blank(),
text = element_text(size=15, family="Interstate"),
axis.text.x = element_text(hjust = 1, size=15,color="black"),
axis.text.y = element_text(hjust = 1, size=15,color="black"),
axis.title.y=element_text(size=15),
axis.title.x=element_text(size=15),
axis.ticks = element_line(colour=NA),
panel.grid.major = element_line(colour = 'gray', linetype = 'dashed'),
panel.grid.minor = element_line(colour = NA),
panel.background = element_rect(colour = 'white'))
Data Preparation Parameters
variable <- "SOLR"
nclusters <- 5
SCBH1 <- readRDS("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/SCBH1.rds")
Vectorize SCBH1 over the training and test partitions.
vector <- station.vectorize(SCBH1,variable)
vector <- vector[,c(1:3,12:21)]
If SOLR is missing any hour value, the day is discarded.
vector <- vector[complete.cases(vector),]
Cluster with 5-means the dataset to obtain the daily profiles.
#Set seed for reproducibility
set.seed(1234)
kmeans.results <- kmeans(vector[,4:ncol(vector)],nclusters)
vector$clusterid <- kmeans.results[[1]]
centroids <- kmeans.results[[2]]
#Order the centroids in crescent order of intensity and reset rows
ordered.centroids <- as.numeric(names(sort(rowSums(centroids))))
centroids <- centroids[ordered.centroids,]
rownames(centroids) <- seq(1,nclusters)
#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
vector$clusterid <- ordered.centroids[vector$clusterid]
Define the istransition variable by observing if the previous day had a different clusterid than the current.
vector$istransition <- vector$clusterid
for (i in 2:length(vector$istransition)){
if(vector$istransition[i-1] == vector$istransition[i])
vector$istransition[i] <- 0 #False for C++
else
vector$istransition[i] <- 1 #True for C++
}
vector$istransition[1] <- -9999 # NA for C++
vector.solar <- vector
Sample 5 vectors.
kable(vector[sample(nrow(vector),5),])
| YEAR | MON | DAY | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | clusterid | istransition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2853 | 2011 | 8 | 20 | 64 | 140 | 134 | 149 | 201 | 188 | 146 | 211 | 129 | 81 | 4 | 1 |
| 38 | 2003 | 2 | 7 | 372 | 620 | 736 | 826 | 850 | 793 | 685 | 497 | 280 | 60 | 2 | 1 |
| 959 | 2006 | 3 | 24 | 199 | 529 | 929 | 995 | 726 | 501 | 333 | 218 | 181 | 59 | 2 | 1 |
| 2939 | 2011 | 11 | 15 | 113 | 146 | 185 | 207 | 250 | 264 | 149 | 227 | 73 | 7 | 4 | 1 |
| 1932 | 2008 | 12 | 18 | 62 | 116 | 296 | 304 | 260 | 565 | 472 | 449 | 134 | 16 | 4 | 1 |
Centroids.
centroids_plot <- data.frame(centroids)
centroids_plot$color <-rownames(centroids)
colnames(centroids_plot) <- c(colnames(centroids),"Profile")
kable(centroids)
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
|---|---|---|---|---|---|---|---|---|---|
| 96.48299 | 147.0476 | 185.7497 | 209.1020 | 225.3701 | 235.7184 | 240.3211 | 238.3252 | 202.2259 | 109.97551 |
| 265.63504 | 416.2701 | 524.8015 | 536.3723 | 466.2657 | 392.3810 | 337.9927 | 257.6292 | 170.9825 | 70.75328 |
| 179.27831 | 312.7568 | 434.9427 | 547.3954 | 651.0064 | 687.7654 | 639.9725 | 530.1320 | 341.7138 | 146.76885 |
| 359.17192 | 574.7114 | 732.5726 | 803.5174 | 768.3454 | 669.8628 | 540.4795 | 396.4653 | 258.4795 | 108.48107 |
| 357.29153 | 573.5679 | 752.0663 | 885.0358 | 954.2930 | 941.3826 | 841.5112 | 675.5182 | 450.1414 | 200.40016 |
ggparcoord(
data = centroids_plot,
columns=c(1:(ncol(centroids)-1)),
scale="globalminmax",
alphaLines=0.2,
groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
ylab("Solar Radiation Itensity (W/m2") + ggtitle(paste('SCBH1',variable,'Centroids'))# + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))
Data Preparation Parameters
variable <- "PREC"
nclusters <- 5
Vectorize SCBH1 over the training and test partitions.
vector <- station.vectorize(SCBH1,variable)
vector <- vector[,c(1:3,12:21)]
If PREC is missing any hour value, the day is discarded.
vector <- vector[complete.cases(vector),]
#Set seed for reproducibility
set.seed(1234)
#Mean precipitation of each day
mean.prec <- rowMeans(vector[,4:ncol(vector)])
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.1.3
## Loading required package: grid
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.1.3
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
## cluster
##
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 3.1.3
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:plyr':
##
## is.discrete, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
#Min and max
bins <- cut(mean.prec,breaks=5)
interval_labels <- levels(cut(mean.prec,breaks=5))
bin_interval_mean <- function(interval_label){
lower_and_upper <- unlist(strsplit(interval_label, ",", fixed = TRUE))
lower <- as.numeric(substr(lower_and_upper[1],start=2,stop=nchar(lower_and_upper[1])))
upper <- as.numeric(substr(lower_and_upper[2],start=1,stop=nchar(lower_and_upper[2])-1))
return (mean(c(lower,upper)))
}
centroids[1,1:ncol(centroids)] <- bin_interval_mean(interval_labels[1])
centroids[2,1:ncol(centroids)] <- bin_interval_mean(interval_labels[2])
centroids[3,1:ncol(centroids)] <- bin_interval_mean(interval_labels[3])
centroids[4,1:ncol(centroids)] <- bin_interval_mean(interval_labels[4])
centroids[5,1:ncol(centroids)] <- bin_interval_mean(interval_labels[5])
vector$clusterid <- bins
#Order the centroids in crescent order of intensity and reset rows
ordered.centroids <- as.numeric(names(sort(rowSums(centroids))))
centroids <- centroids[ordered.centroids,]
rownames(centroids) <- seq(1,nclusters)
#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
vector$clusterid <- ordered.centroids[vector$clusterid]
Define the istransition variable by observing if the previous day had a different clusterid than the current.
vector$istransition <- vector$clusterid
for (i in 2:length(vector$istransition)){
if(vector$istransition[i-1] == vector$istransition[i])
vector$istransition[i] <- 0 #False for C++
else
vector$istransition[i] <- 1 #True for C++
}
vector$istransition[1] <- -9999 # NA for C++
vector.prec <- vector
Sample 5 vectors.
kable(vector[sample(nrow(vector),5),])
| YEAR | MON | DAY | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | clusterid | istransition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 598 | 2005 | 3 | 25 | 34.11 | 34.12 | 34.13 | 34.13 | 34.13 | 34.32 | 34.34 | 34.34 | 34.39 | 34.49 | 2 | 1 |
| 2592 | 2010 | 10 | 19 | 15.42 | 15.42 | 15.42 | 15.42 | 15.42 | 15.42 | 15.42 | 15.42 | 15.42 | 15.42 | 1 | 1 |
| 2543 | 2010 | 8 | 31 | 13.70 | 13.70 | 13.70 | 13.70 | 13.70 | 13.70 | 13.70 | 13.70 | 13.70 | 13.70 | 1 | 0 |
| 2595 | 2010 | 10 | 22 | 15.42 | 15.42 | 15.42 | 15.42 | 15.42 | 15.42 | 15.42 | 15.42 | 15.42 | 15.52 | 1 | 0 |
| 3543 | 2013 | 7 | 12 | 29.59 | 29.59 | 29.59 | 29.59 | 29.59 | 29.59 | 29.60 | 29.60 | 29.61 | 29.62 | 2 | 1 |
Centroids.
centroids_plot <- data.frame(centroids)
centroids_plot$color <-rownames(centroids)
colnames(centroids_plot) <- c(colnames(centroids),"Profile")
kable(centroids)
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
|---|---|---|---|---|---|---|---|---|---|
| 10.0495 | 10.0495 | 10.0495 | 10.0495 | 10.0495 | 10.0495 | 10.0495 | 10.0495 | 10.0495 | 10.0495 |
| 30.3000 | 30.3000 | 30.3000 | 30.3000 | 30.3000 | 30.3000 | 30.3000 | 30.3000 | 30.3000 | 30.3000 |
| 50.5000 | 50.5000 | 50.5000 | 50.5000 | 50.5000 | 50.5000 | 50.5000 | 50.5000 | 50.5000 | 50.5000 |
| 70.7000 | 70.7000 | 70.7000 | 70.7000 | 70.7000 | 70.7000 | 70.7000 | 70.7000 | 70.7000 | 70.7000 |
| 90.9000 | 90.9000 | 90.9000 | 90.9000 | 90.9000 | 90.9000 | 90.9000 | 90.9000 | 90.9000 | 90.9000 |
ggparcoord(
data = centroids_plot,
columns=c(1:(ncol(centroids)-1)),
scale="globalminmax",
alphaLines=0.2,
groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
ylab("Precipitation (in)") + ggtitle(paste('SCBH1',variable,'Centroids'))# + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))
Data Preparation Parameters
variable <- "TMPF"
nclusters <- 5
Vectorize SCBH1 over the training and test partitions.
vector <- station.vectorize(SCBH1,variable)
vector <- vector[,c(1:3,12:21)]
If TMPF is missing any hour value, the day is discarded.
vector <- vector[complete.cases(vector),]
Cluster with 5-means the dataset to obtain the daily profiles.
#Set seed for reproducibility
set.seed(1234)
kmeans.results <- kmeans(vector[,4:ncol(vector)],nclusters)
vector$clusterid <- kmeans.results[[1]]
centroids <- kmeans.results[[2]]
#Order the centroids in crescent order of intensity and reset rows
ordered.centroids <- as.numeric(names(sort(rowSums(centroids))))
centroids <- centroids[ordered.centroids,]
rownames(centroids) <- seq(1,nclusters)
#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
vector$clusterid <- ordered.centroids[vector$clusterid]
Define the istransition variable by observing if the previous day had a different clusterid than the current.
vector$istransition <- vector$clusterid
for (i in 2:length(vector$istransition)){
if(vector$istransition[i-1] == vector$istransition[i])
vector$istransition[i] <- 0 #False for C++
else
vector$istransition[i] <- 1 #True for C++
}
vector$istransition[1] <- -9999 # NA for C++
vector.tmpfc <- vector
Sample 5 vectors.
kable(vector[sample(nrow(vector),5),])
| YEAR | MON | DAY | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | clusterid | istransition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2676 | 2011 | 1 | 24 | 65 | 74 | 75 | 74 | 73 | 75 | 78 | 75 | 79 | 72 | 5 | 1 |
| 43 | 2003 | 2 | 12 | 67 | 69 | 72 | 74 | 77 | 77 | 73 | 72 | 69 | 69 | 5 | 1 |
| 1088 | 2006 | 8 | 27 | 79 | 79 | 82 | 80 | 80 | 75 | 81 | 84 | 84 | 83 | 2 | 1 |
| 2779 | 2011 | 5 | 7 | 69 | 67 | 66 | 70 | 70 | 72 | 72 | 71 | 69 | 67 | 1 | 1 |
| 2176 | 2009 | 8 | 19 | 80 | 82 | 81 | 84 | 79 | 84 | 82 | 85 | 83 | 78 | 2 | 1 |
Centroids.
centroids_plot <- data.frame(centroids)
centroids_plot$color <-rownames(centroids)
colnames(centroids_plot) <- c(colnames(centroids),"Profile")
kable(centroids)
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
|---|---|---|---|---|---|---|---|---|---|
| 67.13850 | 68.49307 | 69.43490 | 69.95014 | 70.36288 | 70.27978 | 69.75069 | 69.44875 | 68.65651 | 67.13573 |
| 69.35815 | 72.10112 | 73.48034 | 74.34129 | 74.50421 | 74.40590 | 74.03511 | 73.20365 | 72.16433 | 70.33427 |
| 72.90111 | 75.43667 | 76.67667 | 77.17444 | 77.31222 | 77.10889 | 76.58000 | 75.60333 | 74.27556 | 72.24000 |
| 76.27563 | 77.96518 | 79.05609 | 79.55416 | 79.69439 | 79.51451 | 79.10928 | 78.37041 | 77.02998 | 75.00193 |
| 78.60000 | 80.42192 | 81.70411 | 82.57808 | 82.81918 | 82.69452 | 82.26712 | 81.31370 | 79.62603 | 77.17808 |
ggparcoord(
data = centroids_plot,
columns=c(1:(ncol(centroids)-1)),
scale="globalminmax",
alphaLines=0.2,
groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
ylab("Solar Radiation Itensity (W/m2") + ggtitle(paste('SCBH1',variable,'Centroids'))# + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))
Data Preparation Parameters
variable <- "DWPF"
nclusters <- 5
Vectorize SCBH1 over the training and test partitions.
vector <- station.vectorize(SCBH1,variable)
vector <- vector[,c(1:3,12:21)]
If DWPF is missing any hour value, the day is discarded.
vector <- vector[complete.cases(vector),]
Cluster with 5-means the dataset to obtain the daily profiles.
#Set seed for reproducibility
set.seed(1234)
kmeans.results <- kmeans(vector[,4:ncol(vector)],nclusters)
vector$clusterid <- kmeans.results[[1]]
centroids <- kmeans.results[[2]]
#Order the centroids in crescent order of intensity and reset rows
ordered.centroids <- as.numeric(names(sort(rowSums(centroids))))
centroids <- centroids[ordered.centroids,]
rownames(centroids) <- seq(1,nclusters)
#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
vector$clusterid <- ordered.centroids[vector$clusterid]
Define the istransition variable by observing if the previous day had a different clusterid than the current.
vector$istransition <- vector$clusterid
for (i in 2:length(vector$istransition)){
if(vector$istransition[i-1] == vector$istransition[i])
vector$istransition[i] <- 0 #False for C++
else
vector$istransition[i] <- 1 #True for C++
}
vector$istransition[1] <- -9999 # NA for C++
vector.dwpfc <- vector
Sample 5 vectors.
kable(vector[sample(nrow(vector),5),])
| YEAR | MON | DAY | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | clusterid | istransition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2675 | 2011 | 1 | 23 | 59.0 | 58.0 | 60.5 | 57.7 | 58.2 | 59.6 | 62.0 | 60.8 | 60.6 | 60.0 | 2 | 1 |
| 43 | 2003 | 2 | 12 | 62.7 | 60.4 | 59.6 | 60.1 | 58.5 | 56.9 | 56.4 | 58.7 | 57.2 | 58.4 | 2 | 1 |
| 1086 | 2006 | 8 | 25 | 65.8 | 62.1 | 63.0 | 64.6 | 65.9 | 65.1 | 65.1 | 66.2 | 66.2 | 65.2 | 4 | 1 |
| 2777 | 2011 | 5 | 5 | 63.2 | 64.6 | 65.3 | 64.1 | 62.0 | 62.9 | 65.1 | 63.4 | 64.5 | 63.8 | 4 | 1 |
| 2174 | 2009 | 8 | 17 | 66.6 | 67.2 | 65.2 | 64.7 | 63.0 | 64.5 | 64.6 | 63.8 | 63.4 | 65.3 | 4 | 1 |
Centroids.
centroids_plot <- data.frame(centroids)
centroids_plot$color <-rownames(centroids)
colnames(centroids_plot) <- c(colnames(centroids),"Profile")
kable(centroids)
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
|---|---|---|---|---|---|---|---|---|---|
| 56.47021 | 56.11543 | 55.42181 | 55.10904 | 55.12606 | 55.02553 | 54.84309 | 54.98723 | 55.25426 | 55.26862 |
| 62.44886 | 61.62593 | 60.80012 | 60.42569 | 60.23385 | 60.23157 | 60.09952 | 60.20312 | 60.31441 | 60.41080 |
| 65.61374 | 64.79147 | 64.23080 | 63.84224 | 63.60427 | 63.56928 | 63.52449 | 63.53191 | 63.53191 | 63.71561 |
| 68.31388 | 67.76972 | 67.22666 | 66.99829 | 66.98410 | 66.95231 | 66.96891 | 66.89527 | 66.96036 | 66.97736 |
| 71.74212 | 71.39322 | 70.98993 | 70.86593 | 71.08590 | 71.11685 | 71.01172 | 70.87692 | 70.70989 | 70.48535 |
ggparcoord(
data = centroids_plot,
columns=c(1:(ncol(centroids)-1)),
scale="globalminmax",
alphaLines=0.2,
groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
ylab("Solar Radiation Itensity (W/m2") + ggtitle(paste('SCBH1',variable,'Centroids'))# + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))
Data Preparation Parameters
variable <- "RELH"
nclusters <- 5
Vectorize SCBH1 over the training and test partitions.
vector <- station.vectorize(SCBH1,variable)
vector <- vector[,c(1:3,12:21)]
If RELH is missing any hour value, the day is discarded.
vector <- vector[complete.cases(vector),]
Cluster with 5-means the dataset to obtain the daily profiles.
#Set seed for reproducibility
set.seed(1234)
kmeans.results <- kmeans(vector[,4:ncol(vector)],nclusters)
vector$clusterid <- kmeans.results[[1]]
centroids <- kmeans.results[[2]]
#Order the centroids in crescent order of intensity and reset rows
ordered.centroids <- as.numeric(names(sort(rowSums(centroids))))
centroids <- centroids[ordered.centroids,]
rownames(centroids) <- seq(1,nclusters)
#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
vector$clusterid <- ordered.centroids[vector$clusterid]
Define the istransition variable by observing if the previous day had a different clusterid than the current.
vector$istransition <- vector$clusterid
for (i in 2:length(vector$istransition)){
if(vector$istransition[i-1] == vector$istransition[i])
vector$istransition[i] <- 0 #False for C++
else
vector$istransition[i] <- 1 #True for C++
}
vector$istransition[1] <- -9999 # NA for C++
vector.relh <- vector
Sample 5 vectors.
kable(vector[sample(nrow(vector),5),])
| YEAR | MON | DAY | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | clusterid | istransition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2674 | 2011 | 1 | 22 | 92 | 70 | 63 | 57 | 54 | 53 | 56 | 56 | 60 | 68 | 2 | 1 |
| 43 | 2003 | 2 | 12 | 86 | 74 | 65 | 62 | 53 | 50 | 56 | 63 | 66 | 69 | 2 | 1 |
| 1088 | 2006 | 8 | 27 | 75 | 68 | 61 | 62 | 63 | 98 | 73 | 67 | 63 | 53 | 2 | 1 |
| 2776 | 2011 | 5 | 4 | 82 | 70 | 73 | 68 | 69 | 65 | 68 | 67 | 74 | 83 | 1 | 1 |
| 2175 | 2009 | 8 | 18 | 79 | 100 | 91 | 57 | 53 | 50 | 59 | 61 | 69 | 72 | 1 | 0 |
Centroids.
centroids_plot <- data.frame(centroids)
centroids_plot$color <-rownames(centroids)
colnames(centroids_plot) <- c(colnames(centroids),"Profile")
kable(centroids)
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
|---|---|---|---|---|---|---|---|---|---|
| 65.57185 | 58.77502 | 55.11298 | 52.95144 | 52.12983 | 52.01685 | 52.52032 | 54.11893 | 57.10505 | 62.23092 |
| 76.48750 | 68.47768 | 63.97768 | 61.47321 | 60.60893 | 60.65268 | 61.32589 | 63.16429 | 66.18750 | 71.51071 |
| 88.66667 | 82.81530 | 77.44012 | 73.56277 | 71.20924 | 69.55988 | 69.36219 | 70.34921 | 73.37374 | 78.84993 |
| 78.82600 | 70.20755 | 66.13627 | 66.55975 | 69.77149 | 74.46960 | 78.07547 | 81.62474 | 85.66876 | 89.92243 |
| 94.30963 | 91.15138 | 88.83716 | 87.88991 | 87.81422 | 88.42202 | 88.97477 | 89.85321 | 90.91743 | 94.12156 |
ggparcoord(
data = centroids_plot,
columns=c(1:(ncol(centroids)-1)),
scale="globalminmax",
alphaLines=0.2,
groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
ylab("Solar Radiation Itensity (W/m2") + ggtitle(paste('SCBH1',variable,'Centroids'))# + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))
Data Preparation Parameters
variable <- "SKNT"
nclusters <- 5
Vectorize SCBH1 over the training and test partitions.
vector <- station.vectorize(SCBH1,variable)
vector <- vector[,c(1:3,12:21)]
If SKNT is missing any hour value, the day is discarded.
vector <- vector[complete.cases(vector),]
Cluster with 5-means the dataset to obtain the daily profiles.
#Set seed for reproducibility
set.seed(1234)
kmeans.results <- kmeans(vector[,4:ncol(vector)],nclusters)
vector$clusterid <- kmeans.results[[1]]
centroids <- kmeans.results[[2]]
#Order the centroids in crescent order of intensity and reset rows
ordered.centroids <- as.numeric(names(sort(rowSums(centroids))))
centroids <- centroids[ordered.centroids,]
rownames(centroids) <- seq(1,nclusters)
#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
vector$clusterid <- ordered.centroids[vector$clusterid]
Define the istransition variable by observing if the previous day had a different clusterid than the current.
vector$istransition <- vector$clusterid
for (i in 2:length(vector$istransition)){
if(vector$istransition[i-1] == vector$istransition[i])
vector$istransition[i] <- 0 #False for C++
else
vector$istransition[i] <- 1 #True for C++
}
vector$istransition[1] <- -9999 # NA for C++
vector.wind <- vector
Sample 5 vectors.
kable(vector[sample(nrow(vector),5),])
| YEAR | MON | DAY | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | clusterid | istransition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2501 | 2010 | 7 | 20 | 3 | 5 | 8 | 9 | 11 | 12 | 11 | 10 | 9 | 8 | 5 | 1 |
| 40 | 2003 | 2 | 9 | 1 | 2 | 6 | 5 | 6 | 5 | 6 | 3 | 4 | 7 | 1 | 1 |
| 1002 | 2006 | 6 | 2 | 5 | 2 | 4 | 5 | 9 | 7 | 10 | 8 | 8 | 7 | 4 | 1 |
| 2588 | 2010 | 10 | 15 | 3 | 10 | 6 | 9 | 8 | 8 | 8 | 8 | 7 | 6 | 4 | 1 |
| 2047 | 2009 | 4 | 12 | 7 | 7 | 9 | 7 | 11 | 11 | 9 | 8 | 6 | 7 | 5 | 1 |
Centroids.
centroids_plot <- data.frame(centroids)
centroids_plot$color <-rownames(centroids)
colnames(centroids_plot) <- c(colnames(centroids),"Profile")
kable(centroids)
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
|---|---|---|---|---|---|---|---|---|---|
| 3.229682 | 4.238516 | 4.935512 | 5.522968 | 5.802120 | 5.905477 | 5.725265 | 5.361307 | 4.752650 | 3.852473 |
| 4.404416 | 6.048580 | 7.101577 | 7.807571 | 8.184227 | 8.439748 | 8.328707 | 7.994953 | 7.157098 | 5.984227 |
| 6.879090 | 8.610242 | 9.785206 | 10.186344 | 10.492176 | 10.550498 | 10.302987 | 9.911807 | 9.012802 | 7.752489 |
| 48.045454 | 45.136364 | 50.772727 | 59.363636 | 66.227273 | 56.090909 | 56.090909 | 48.227273 | 47.545454 | 47.318182 |
| 58.461539 | 78.846154 | 87.846154 | 102.153846 | 99.230769 | 95.615385 | 88.923077 | 92.307692 | 80.307692 | 69.692308 |
ggparcoord(
data = centroids_plot,
columns=c(1:(ncol(centroids)-1)),
scale="globalminmax",
alphaLines=0.2,
groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
ylab("Solar Radiation Itensity (W/m2") + ggtitle(paste('SCBH1',variable,'Centroids'))# + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))
entropy <- read.table("~/Desktop/OCI/out/entropy-tsmissing.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))
++ Solar Cluster x Solar Transition
+++ Data Preparation
solar <- vector.solar[,c("YEAR","MON","DAY","istransition")]
solarc <- vector.solar[,c("YEAR","MON","DAY","clusterid")]
write.table(vector.solar[,c("clusterid","istransition")],file="~/Desktop/OCI/data/solarCid_solarTransition.csv",col.names=FALSE,row.names=FALSE,sep=",")
+++ Summary
entropy <- read.table("~/Desktop/OCI/out/entropy-solarCid_solarTransition.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))
++ Humidity Transition x Solar Transition
+++ Data Preparation
#Get the Humidity Transitions
relh <- vector.relh[,c("YEAR","MON","DAY","istransition")]
#Inner Join with the Solar Transitions
pair <- merge(relh,solar,by=c("YEAR","MON","DAY"))[,c("istransition.x","istransition.y")]
colnames(pair) <- c("relht","solart")
#Save
write.table(pair[,c("relht","solart")],file="~/Desktop/OCI/data/humidityTransition_SolarTransition.csv",col.names=FALSE,row.names=FALSE,sep=",")
+++ Summary
entropy <- read.table("~/Desktop/OCI/out/entropy-humidityTransition_SolarTransition.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))
#Get the Humidity Transitions
relhc <- vector.relh[,c("YEAR","MON","DAY","clusterid")]
#Inner Join with the Solar and Humidity Centroids
pair <- merge(relhc,solarc,by=c("YEAR","MON","DAY"))[,c("clusterid.x","clusterid.y")]
colnames(pair) <- c("relhc","solarc")
#Save
write.table(pair[,c("relhc","solarc")],file="~/Desktop/OCI/data/humidityCentroid_SolarCentroid.csv",col.names=FALSE,row.names=FALSE,sep=",")
entropy <- read.table("~/Desktop/OCI/out/entropy-humidityCentroid_SolarCentroid.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))
#Get the Temperature Centroids
tmpfc <- vector.tmpfc[,c("YEAR","MON","DAY","clusterid")]
#Inner Join with the Solar and Temperature Centroids
pair <- merge(tmpfc,solarc,by=c("YEAR","MON","DAY"))[,c("clusterid.x","clusterid.y")]
colnames(pair) <- c("tmpfc","solarc")
#Save
write.table(pair[,c("tmpfc","solarc")],file="~/Desktop/OCI/data/temperatureCentroid_SolarCentroid.csv",col.names=FALSE,row.names=FALSE,sep=",")
entropy <- read.table("~/Desktop/OCI/out/entropy-temperatureCentroid_SolarCentroid.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))
#Get the Humidity Transitions
dwpfc <- vector.dwpfc[,c("YEAR","MON","DAY","clusterid")]
#Inner Join with the Solar and Humidity Centroids
pair <- merge(dwpfc,solarc,by=c("YEAR","MON","DAY"))[,c("clusterid.x","clusterid.y")]
colnames(pair) <- c("dwpfc","solarc")
#Save
write.table(pair[,c("dwpfc","solarc")],file="~/Desktop/OCI/data/dewpointCentroid_SolarCentroid.csv",col.names=FALSE,row.names=FALSE,sep=",")
entropy <- read.table("~/Desktop/OCI/out/entropy-dewpointCentroid_SolarCentroid.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))
#Get the Humidity Transitions
precc <- vector.prec[,c("YEAR","MON","DAY","clusterid")]
#Inner Join with the Solar and Humidity Centroids
pair <- merge(precc,solarc,by=c("YEAR","MON","DAY"))[,c("clusterid.x","clusterid.y")]
colnames(pair) <- c("precc","solarc")
#Save
write.table(pair[,c("precc","solarc")],file="~/Desktop/OCI/data/precipitationCentroid_SolarCentroid.csv",col.names=FALSE,row.names=FALSE,sep=",")
entropy <- read.table("~/Desktop/OCI/out/entropy-precipitationCentroid_SolarCentroid.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))
++ Wind Speed Transition x Solar Transition
+++ Data Preparation
#Get the wind Transitions
wind <- vector.wind[,c("YEAR","MON","DAY","istransition")]
#Inner Join with the Solar Transitions
pair <- merge(wind,solar,by=c("YEAR","MON","DAY"))[,c("istransition.x","istransition.y")]
colnames(pair) <- c("windt","solart")
#Save
write.table(pair[,c("windt","solart")],file="~/Desktop/OCI/data/windTransition_SolarTransition.csv",col.names=FALSE,row.names=FALSE,sep=",")
+++ Summary
entropy <- read.table("~/Desktop/OCI/out/entropy-windTransition_SolarTransition.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))
#Get the wind Transitions
windc <- vector.wind[,c("YEAR","MON","DAY","clusterid")]
#Inner Join with the Solar Transitions
pair <- merge(windc,solarc,by=c("YEAR","MON","DAY"))[,c("clusterid.x","clusterid.y")]
colnames(pair) <- c("windc","solarc")
#Save
write.table(pair[,c("windc","solarc")],file="~/Desktop/OCI/data/windCentroid_SolarCentroid.csv",col.names=FALSE,row.names=FALSE,sep=",")
entropy <- read.table("~/Desktop/OCI/out/entropy-windCentroid_SolarCentroid.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))