Required Libraries

Load required libraries and plot template.

#Load Required Libraries
library(solarf)
library(knitr) #For pretty tables on the report using kable 
library(plyr)
library(ggplot2)
library(GGally)
library(caret)
library(extrafont)
#Theme for Plot
science_theme = 

theme(
    panel.background=element_blank(), 
    #panel.grid.major = element_line(size = 0.5, color = "grey"),
    #axis.line = element_line(size = 0.7, color = "black"), 
    #legend.position = c(0.9, 0.7), 
    #text = element_text(size = 14),
    axis.line=element_blank(),
    text = element_text(size=15, family="Interstate"),
    axis.text.x = element_text(hjust = 1, size=15,color="black"), 
    axis.text.y = element_text(hjust = 1, size=15,color="black"), 
    axis.title.y=element_text(size=15), 
    axis.title.x=element_text(size=15),
    axis.ticks = element_line(colour=NA),
    panel.grid.major = element_line(colour = 'gray', linetype = 'dashed'),
    panel.grid.minor = element_line(colour = NA),  
    panel.background = element_rect(colour = 'white'))

Data Preparation Parameters

variable <- "SOLR"
nclusters <- 5

SOLR

#C0875.h <- readRDS("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/C0875.h.rds")
C0875 <- read.csv("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/C0875_2011_2014.csv")


## CHANGE GRANULARITY FUNCTION
station.vectorize.hour <- function(station.dataframe,variable){
    library(reshape2)
    use.only.last.ocurrence <- function(minute.obs){
        return (minute.obs[1])
    }
    
    station.dataframe.vectorized <- dcast(data=station.dataframe, formula=YEAR + MON + DAY + HR ~ MIN,value.var = variable,fun.aggregate=use.only.last.ocurrence)
    return (station.dataframe.vectorized)
}
##

## CHANGE GRANULARITY  REP.CODE
C0875.vector.h <- station.vectorize.hour(C0875,variable)
C0875.h <- C0875.vector.h[,c("YEAR","MON","DAY","HR")]
C0875.h[,variable] <- rowMeans(C0875.vector.h[,5:ncol(C0875.vector.h)],na.rm=TRUE) #Calculate the mean for the hour from min 0 to min 60 ; baseline is hence hour and 30 mins. 
C0875.h$HR <- C0875.h$HR + 1
C0875.h[is.nan(C0875.h[,variable]),variable] <- NA
##END CHANGE GRANULARITY

Vectorization

Vectorize C0875.h over the training and test partitions.

vector <- station.vectorize(C0875.h,variable)
vector <- vector[,c(1:3,11:20)]

Cleansing

If SOLR is missing any hour value, the day is discarded.

vector <- vector[complete.cases(vector),]

Discretization 5-means

Cluster with 5-means the dataset to obtain the daily profiles.

#Set seed for reproducibility
set.seed(1234)
kmeans.results <- kmeans(vector[,4:ncol(vector)],nclusters)
vector$clusterid <- kmeans.results[[1]]
centroids <- kmeans.results[[2]]

#Order the centroids in crescent order of intensity and reset rows
ordered.centroids <- as.numeric(names(sort(rowSums(centroids))))
centroids <- centroids[ordered.centroids,]
rownames(centroids) <- seq(1,nclusters)

#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
vector$clusterid <- ordered.centroids[vector$clusterid]

Transitions

Define the istransition variable by observing if the previous day had a different clusterid than the current.

vector$istransition <- vector$clusterid
for (i in 2:length(vector$istransition)){
    if(vector$istransition[i-1] == vector$istransition[i])
        vector$istransition[i] <- 0 #False for C++
    else
        vector$istransition[i] <- 1 #True for C++
}
vector$istransition[1] <- -9999 # NA for C++
vector.solar <- vector

Summary

Sample 5 vectors.

kable(vector[sample(nrow(vector),5),])
YEAR MON DAY 8 9 10 11 12 13 14 15 16 17 clusterid istransition
956 2014 2 25 58.70000 153.0000 290.0833 743.4167 591.4167 545.7500 407.2500 244.1667 215.5833 266.0833 3 1
433 2012 4 29 219.50000 530.9167 800.2500 547.0833 627.7500 740.2500 820.0000 859.8333 632.8333 291.0000 4 1
644 2013 3 9 60.91667 153.0833 254.4167 375.0000 692.0000 674.0000 1003.8333 726.9167 615.0833 490.0000 4 1
970 2014 3 11 174.00000 414.6667 613.9167 696.8333 879.0000 942.4167 913.0833 815.2500 657.5833 451.0000 5 1
850 2013 11 7 158.77778 382.0000 574.5833 758.3333 771.3333 446.0833 268.2727 378.6667 209.5833 212.1000 1 0

Centroids.

centroids_plot <- data.frame(centroids)
centroids_plot$color <-rownames(centroids)
colnames(centroids_plot) <- c(colnames(centroids),"Profile")
kable(centroids)
8 9 10 11 12 13 14 15 16 17
41.81761 125.2847 234.6747 260.8534 248.8749 251.4028 259.4250 263.0444 205.4177 132.6385
86.53531 231.7437 383.0082 489.5958 502.4530 550.8664 545.9115 442.7662 321.3274 184.9058
114.53358 287.3950 491.2309 664.8130 753.6040 744.1934 620.3279 468.9166 348.7987 211.3311
129.88862 305.6151 488.8596 631.5216 744.7629 814.8602 811.9553 720.0813 535.4632 317.5342
206.28015 417.7976 638.0186 826.6715 949.2170 977.1658 941.4765 847.7259 664.0621 448.9646
ggparcoord(
  data = centroids_plot,
  columns=c(1:(ncol(centroids)-1)),
  scale="globalminmax",
  alphaLines=0.2,
  groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
  ylab("Solar Radiation Itensity (W/m2") + ggtitle(paste('C0875.h',variable,'Centroids'))# + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))

Data Preparation Parameters

variable <- "ALTI"
nclusters <- 5

ALTI

Vectorization

Vectorize C0875.h over the training and test partitions.

## CHANGE GRANULARITY  REP.CODE
C0875.vector.h <- station.vectorize.hour(C0875,variable)
C0875.h <- C0875.vector.h[,c("YEAR","MON","DAY","HR")]
C0875.h[,variable] <- rowMeans(C0875.vector.h[,5:ncol(C0875.vector.h)],na.rm=TRUE) #Calculate the mean for the hour from min 0 to min 60 ; baseline is hence hour and 30 mins. 
C0875.h$HR <- C0875.h$HR + 1
C0875.h[is.nan(C0875.h[,variable]),variable] <- NA
##END CHANGE GRANULARITY

vector <- station.vectorize(C0875.h,variable)
vector <- vector[,c(1:3,11:20)]

Cleansing

If ALTI is missing any hour value, the day is discarded.

vector <- vector[complete.cases(vector),]

Discretization Binning (since Centroids are constant lines)

#Set seed for reproducibility
set.seed(1234)

#Mean precipitation of each day
mean.prec <- rowMeans(vector[,4:ncol(vector)])

library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.1.3
## Loading required package: grid
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.1.3
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:caret':
## 
##     cluster
## 
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 3.1.3
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:plyr':
## 
##     is.discrete, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
#Min and max
bins <- cut(mean.prec,breaks=5)
interval_labels <- levels(cut(mean.prec,breaks=5))

bin_interval_mean <- function(interval_label){
    lower_and_upper <- unlist(strsplit(interval_label, ",", fixed = TRUE))   
    lower <- as.numeric(substr(lower_and_upper[1],start=2,stop=nchar(lower_and_upper[1])))
    upper <- as.numeric(substr(lower_and_upper[2],start=1,stop=nchar(lower_and_upper[2])-1))
    return (mean(c(lower,upper)))
}

centroids[1,1:ncol(centroids)] <- bin_interval_mean(interval_labels[1])
centroids[2,1:ncol(centroids)] <- bin_interval_mean(interval_labels[2])
centroids[3,1:ncol(centroids)] <- bin_interval_mean(interval_labels[3])
centroids[4,1:ncol(centroids)] <- bin_interval_mean(interval_labels[4])
centroids[5,1:ncol(centroids)] <- bin_interval_mean(interval_labels[5])

vector$clusterid <- bins

#Order the centroids in crescent order of intensity and reset rows
ordered.centroids <- as.numeric(names(sort(rowSums(centroids))))
centroids <- centroids[ordered.centroids,]
rownames(centroids) <- seq(1,nclusters)

#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
vector$clusterid <- ordered.centroids[vector$clusterid]

Transitions

Define the istransition variable by observing if the previous day had a different clusterid than the current.

vector$istransition <- vector$clusterid
for (i in 2:length(vector$istransition)){
    if(vector$istransition[i-1] == vector$istransition[i])
        vector$istransition[i] <- 0 #False for C++
    else
        vector$istransition[i] <- 1 #True for C++
}
vector$istransition[1] <- -9999 # NA for C++
vector.prec <- vector

Summary

Sample 5 vectors.

kable(vector[sample(nrow(vector),5),])
YEAR MON DAY 8 9 10 11 12 13 14 15 16 17 clusterid istransition
154 2011 6 13 30.07333 30.0900 30.09500 30.10167 30.09889 30.09333 30.08583 30.07000 30.07000 30.06833 4 1
783 2013 8 28 30.01000 30.0100 30.01000 29.99833 29.98333 29.96667 29.95250 29.94000 29.93667 29.93000 3 1
763 2013 7 24 30.00583 30.0100 30.00667 29.99500 29.98917 29.98083 29.97000 29.95455 29.94091 29.93545 3 1
782 2013 8 27 30.10400 30.1100 30.11000 30.11000 30.09500 30.07000 30.04833 30.03083 30.01333 30.01083 4 1
1063 2014 7 12 30.00000 30.0025 30.00000 29.99583 29.98583 29.98000 29.97250 29.96667 29.96000 29.96000 3 1

Centroids.

centroids_plot <- data.frame(centroids)
centroids_plot$color <-rownames(centroids)
colnames(centroids_plot) <- c(colnames(centroids),"Profile")
kable(centroids)
8 9 10 11 12 13 14 15 16 17
29.810 29.810 29.810 29.810 29.810 29.810 29.810 29.810 29.810 29.810
29.905 29.905 29.905 29.905 29.905 29.905 29.905 29.905 29.905 29.905
29.995 29.995 29.995 29.995 29.995 29.995 29.995 29.995 29.995 29.995
30.090 30.090 30.090 30.090 30.090 30.090 30.090 30.090 30.090 30.090
30.185 30.185 30.185 30.185 30.185 30.185 30.185 30.185 30.185 30.185
ggparcoord(
  data = centroids_plot,
  columns=c(1:(ncol(centroids)-1)),
  scale="globalminmax",
  alphaLines=0.2,
  groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
  ylab("Precipitation (in)") + ggtitle(paste('C0875.h',variable,'Centroids'))# + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))

Data Preparation Parameters

variable <- "TMPF"
nclusters <- 5

TMPF

Vectorization

Vectorize C0875.h over the training and test partitions.

## CHANGE GRANULARITY  REP.CODE
C0875.vector.h <- station.vectorize.hour(C0875,variable)
C0875.h <- C0875.vector.h[,c("YEAR","MON","DAY","HR")]
C0875.h[,variable] <- rowMeans(C0875.vector.h[,5:ncol(C0875.vector.h)],na.rm=TRUE) #Calculate the mean for the hour from min 0 to min 60 ; baseline is hence hour and 30 mins. 
C0875.h$HR <- C0875.h$HR + 1
C0875.h[is.nan(C0875.h[,variable]),variable] <- NA
##END CHANGE GRANULARITY

vector <- station.vectorize(C0875.h,variable)
vector <- vector[,c(1:3,11:20)]

Cleansing

If TMPF is missing any hour value, the day is discarded.

vector <- vector[complete.cases(vector),]

Discretization 5-means

Cluster with 5-means the dataset to obtain the daily profiles.

#Set seed for reproducibility
set.seed(1234)
kmeans.results <- kmeans(vector[,4:ncol(vector)],nclusters)
vector$clusterid <- kmeans.results[[1]]
centroids <- kmeans.results[[2]]

#Order the centroids in crescent order of intensity and reset rows
ordered.centroids <- as.numeric(names(sort(rowSums(centroids))))
centroids <- centroids[ordered.centroids,]
rownames(centroids) <- seq(1,nclusters)

#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
vector$clusterid <- ordered.centroids[vector$clusterid]

Transitions

Define the istransition variable by observing if the previous day had a different clusterid than the current.

vector$istransition <- vector$clusterid
for (i in 2:length(vector$istransition)){
    if(vector$istransition[i-1] == vector$istransition[i])
        vector$istransition[i] <- 0 #False for C++
    else
        vector$istransition[i] <- 1 #True for C++
}
vector$istransition[1] <- -9999 # NA for C++
vector.tmpfc <- vector

Summary

Sample 5 vectors.

kable(vector[sample(nrow(vector),5),])
YEAR MON DAY 8 9 10 11 12 13 14 15 16 17 clusterid istransition
940 2014 2 9 71.58333 73.00000 74.25000 75.16667 76.16667 76.91667 78.33333 77.83333 78.08333 77.16667 4 1
8 2011 1 8 67.00000 67.41667 68.58333 70.25000 71.36364 71.08333 71.25000 71.41667 70.91667 70.66667 2 1
611 2013 1 31 69.08333 70.36364 71.69231 72.91667 73.16667 74.09091 74.25000 73.66667 72.91667 72.41667 3 1
956 2014 2 25 66.40000 67.00000 68.08333 70.66667 71.08333 72.00000 73.25000 72.16667 71.00000 70.16667 2 1
829 2013 10 17 75.87500 76.66667 78.16667 81.41667 80.83333 80.91667 82.75000 81.66667 80.58333 80.00000 5 1

Centroids.

centroids_plot <- data.frame(centroids)
centroids_plot$color <-rownames(centroids)
colnames(centroids_plot) <- c(colnames(centroids),"Profile")
kable(centroids)
8 9 10 11 12 13 14 15 16 17
0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
65.98596 67.52987 69.47197 70.46184 71.00570 71.15887 71.00725 70.30359 69.94046 69.45749
70.43443 71.56031 72.76930 73.75541 74.39211 74.90663 75.05766 74.91218 74.37025 73.60410
73.11687 74.51697 75.93278 76.99524 77.75652 78.12831 78.22402 78.08065 77.47524 76.55797
75.74037 77.34352 79.01742 80.16918 80.98724 81.61613 81.80857 81.53023 80.82750 79.83494
ggparcoord(
  data = centroids_plot,
  columns=c(1:(ncol(centroids)-1)),
  scale="globalminmax",
  alphaLines=0.2,
  groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
  ylab("Solar Radiation Itensity (W/m2") + ggtitle(paste('C0875.h',variable,'Centroids'))# + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))

Data Preparation Parameters

variable <- "DWPF"
nclusters <- 5

DWPF

Vectorization

Vectorize C0875.h over the training and test partitions.

## CHANGE GRANULARITY  REP.CODE
C0875.vector.h <- station.vectorize.hour(C0875,variable)
C0875.h <- C0875.vector.h[,c("YEAR","MON","DAY","HR")]
C0875.h[,variable] <- rowMeans(C0875.vector.h[,5:ncol(C0875.vector.h)],na.rm=TRUE) #Calculate the mean for the hour from min 0 to min 60 ; baseline is hence hour and 30 mins. 
C0875.h$HR <- C0875.h$HR + 1
C0875.h[is.nan(C0875.h[,variable]),variable] <- NA
##END CHANGE GRANULARITY

vector <- station.vectorize(C0875.h,variable)
vector <- vector[,c(1:3,11:20)]

Cleansing

If DWPF is missing any hour value, the day is discarded.

vector <- vector[complete.cases(vector),]

Discretization 5-means

Cluster with 5-means the dataset to obtain the daily profiles.

#Set seed for reproducibility
set.seed(1234)
kmeans.results <- kmeans(vector[,4:ncol(vector)],nclusters)
vector$clusterid <- kmeans.results[[1]]
centroids <- kmeans.results[[2]]

#Order the centroids in crescent order of intensity and reset rows
ordered.centroids <- as.numeric(names(sort(rowSums(centroids))))
centroids <- centroids[ordered.centroids,]
rownames(centroids) <- seq(1,nclusters)

#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
vector$clusterid <- ordered.centroids[vector$clusterid]

Transitions

Define the istransition variable by observing if the previous day had a different clusterid than the current.

vector$istransition <- vector$clusterid
for (i in 2:length(vector$istransition)){
    if(vector$istransition[i-1] == vector$istransition[i])
        vector$istransition[i] <- 0 #False for C++
    else
        vector$istransition[i] <- 1 #True for C++
}
vector$istransition[1] <- -9999 # NA for C++
vector.dwpfc <- vector

Summary

Sample 5 vectors.

kable(vector[sample(nrow(vector),5),])
YEAR MON DAY 8 9 10 11 12 13 14 15 16 17 clusterid istransition
943 2014 2 12 66.35000 66.56667 66.56667 68.64167 68.70833 67.80000 66.59167 65.06667 65.95833 66.76667 4 1
205 2011 8 14 66.25000 67.02222 67.13333 66.98889 67.32500 67.95556 67.64444 67.72500 68.10000 68.07000 4 1
620 2013 2 13 56.11667 57.08333 57.80833 56.55833 57.96667 57.35000 57.29167 57.91667 57.84167 58.25833 2 1
960 2014 3 1 65.73750 65.36667 65.27500 66.06667 66.42500 67.12500 65.78333 66.45833 66.00000 66.64167 4 1
833 2013 10 21 63.24000 63.24167 63.22500 64.36667 64.66667 66.00000 65.75000 65.75000 64.74167 65.05833 3 1

Centroids.

centroids_plot <- data.frame(centroids)
centroids_plot$color <-rownames(centroids)
colnames(centroids_plot) <- c(colnames(centroids),"Profile")
kable(centroids)
8 9 10 11 12 13 14 15 16 17
50.36524 51.05823 51.28203 51.90283 52.51746 53.31268 53.49643 54.07300 54.04386 53.79800
59.13956 59.58633 59.90201 59.91526 60.04278 60.36267 60.52514 60.65857 60.85279 60.79912
63.71016 64.12099 64.41991 64.60124 64.64479 64.67152 64.66111 64.62758 64.63555 64.66560
66.75131 67.19017 67.42462 67.55822 67.66095 67.65834 67.57672 67.61297 67.61228 67.51312
69.76221 70.20099 70.53443 70.73323 70.74292 70.73599 70.78219 70.77175 70.63376 70.41722
ggparcoord(
  data = centroids_plot,
  columns=c(1:(ncol(centroids)-1)),
  scale="globalminmax",
  alphaLines=0.2,
  groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
  ylab("Solar Radiation Itensity (W/m2") + ggtitle(paste('C0875.h',variable,'Centroids'))# + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))

Data Preparation Parameters

variable <- "RELH"
nclusters <- 5

RELH

Vectorization

Vectorize C0875.h over the training and test partitions.

## CHANGE GRANULARITY  REP.CODE
C0875.vector.h <- station.vectorize.hour(C0875,variable)
C0875.h <- C0875.vector.h[,c("YEAR","MON","DAY","HR")]
C0875.h[,variable] <- rowMeans(C0875.vector.h[,5:ncol(C0875.vector.h)],na.rm=TRUE) #Calculate the mean for the hour from min 0 to min 60 ; baseline is hence hour and 30 mins. 
C0875.h$HR <- C0875.h$HR + 1
C0875.h[is.nan(C0875.h[,variable]),variable] <- NA
##END CHANGE GRANULARITY

vector <- station.vectorize(C0875.h,variable)
vector <- vector[,c(1:3,11:20)]

Cleansing

If RELH is missing any hour value, the day is discarded.

vector <- vector[complete.cases(vector),]

Discretization 5-means

Cluster with 5-means the dataset to obtain the daily profiles.

#Set seed for reproducibility
set.seed(1234)
kmeans.results <- kmeans(vector[,4:ncol(vector)],nclusters)
vector$clusterid <- kmeans.results[[1]]
centroids <- kmeans.results[[2]]

#Order the centroids in crescent order of intensity and reset rows
ordered.centroids <- as.numeric(names(sort(rowSums(centroids))))
centroids <- centroids[ordered.centroids,]
rownames(centroids) <- seq(1,nclusters)

#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
vector$clusterid <- ordered.centroids[vector$clusterid]

Transitions

Define the istransition variable by observing if the previous day had a different clusterid than the current.

vector$istransition <- vector$clusterid
for (i in 2:length(vector$istransition)){
    if(vector$istransition[i-1] == vector$istransition[i])
        vector$istransition[i] <- 0 #False for C++
    else
        vector$istransition[i] <- 1 #True for C++
}
vector$istransition[1] <- -9999 # NA for C++
vector.relh <- vector

Summary

Sample 5 vectors.

kable(vector[sample(nrow(vector),5),])
YEAR MON DAY 8 9 10 11 12 13 14 15 16 17 clusterid istransition
940 2014 2 9 86.83333 81.75000 79.50000 77.50000 76.75000 75.00000 73.00000 74.66667 74.50000 74.08333 4 1
8 2011 1 8 45.91667 46.08333 46.00000 45.33333 45.09091 45.00000 45.33333 45.66667 45.91667 46.00000 2 1
611 2013 1 31 81.08333 80.36364 78.07692 75.83333 74.00000 73.09091 72.41667 73.25000 76.00000 76.58333 4 1
956 2014 2 25 88.10000 87.50000 82.75000 75.16667 72.66667 70.08333 68.16667 71.00000 71.41667 72.16667 4 1
829 2013 10 17 77.62500 76.25000 75.50000 69.25000 72.33333 70.16667 65.66667 67.91667 69.25000 68.83333 3 1

Centroids.

centroids_plot <- data.frame(centroids)
centroids_plot$color <-rownames(centroids)
colnames(centroids_plot) <- c(colnames(centroids),"Profile")
kable(centroids)
8 9 10 11 12 13 14 15 16 17
8.675926 13.15741 15.88889 16.13889 17.30556 17.41667 16.29630 13.85185 13.33333 13.12963
69.414437 66.72594 63.07818 60.83641 59.30036 58.76455 58.89653 59.84157 61.49409 63.29273
76.780981 74.10610 71.03875 68.50534 66.92341 65.76468 65.27248 65.90217 67.35675 69.43879
81.029615 79.26991 76.89404 74.98854 73.41702 72.45610 72.15750 72.50476 73.94754 75.58106
87.538562 86.73306 85.43851 84.46789 84.07478 84.12490 83.66399 83.09817 83.32106 84.18066
ggparcoord(
  data = centroids_plot,
  columns=c(1:(ncol(centroids)-1)),
  scale="globalminmax",
  alphaLines=0.2,
  groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
  ylab("Solar Radiation Itensity (W/m2") + ggtitle(paste('C0875.h',variable,'Centroids'))# + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))

Data Preparation Parameters

variable <- "SKNT"
nclusters <- 5

SKNT

Vectorization

Vectorize C0875.h over the training and test partitions.

## CHANGE GRANULARITY  REP.CODE
C0875.vector.h <- station.vectorize.hour(C0875,variable)
C0875.h <- C0875.vector.h[,c("YEAR","MON","DAY","HR")]
C0875.h[,variable] <- rowMeans(C0875.vector.h[,5:ncol(C0875.vector.h)],na.rm=TRUE) #Calculate the mean for the hour from min 0 to min 60 ; baseline is hence hour and 30 mins. 
C0875.h$HR <- C0875.h$HR + 1
C0875.h[is.nan(C0875.h[,variable]),variable] <- NA
##END CHANGE GRANULARITY

vector <- station.vectorize(C0875.h,variable)
vector <- vector[,c(1:3,11:20)]

Cleansing

If SKNT is missing any hour value, the day is discarded.

vector <- vector[complete.cases(vector),]

Discretization 5-means

Cluster with 5-means the dataset to obtain the daily profiles.

#Set seed for reproducibility
set.seed(1234)
kmeans.results <- kmeans(vector[,4:ncol(vector)],nclusters)
vector$clusterid <- kmeans.results[[1]]
centroids <- kmeans.results[[2]]

#Order the centroids in crescent order of intensity and reset rows
ordered.centroids <- as.numeric(names(sort(rowSums(centroids))))
centroids <- centroids[ordered.centroids,]
rownames(centroids) <- seq(1,nclusters)

#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
vector$clusterid <- ordered.centroids[vector$clusterid]

Transitions

Define the istransition variable by observing if the previous day had a different clusterid than the current.

vector$istransition <- vector$clusterid
for (i in 2:length(vector$istransition)){
    if(vector$istransition[i-1] == vector$istransition[i])
        vector$istransition[i] <- 0 #False for C++
    else
        vector$istransition[i] <- 1 #True for C++
}
vector$istransition[1] <- -9999 # NA for C++
vector.wind <- vector

Summary

Sample 5 vectors.

kable(vector[sample(nrow(vector),5),])
YEAR MON DAY 8 9 10 11 12 13 14 15 16 17 clusterid istransition
840 2013 10 28 3.0000000 3.750000 4.500000 3.416667 3.750000 4.166667 3.545454 4.416667 7.333333 4.272727 4 1
10 2011 1 10 6.3333333 6.666667 7.666667 7.166667 6.500000 7.666667 8.083333 12.750000 21.583333 13.833333 2 1
297 2011 12 1 0.5833333 1.250000 1.416667 1.333333 1.916667 2.500000 3.166667 2.666667 3.833333 6.666667 4 1
867 2013 11 24 6.0000000 7.916667 6.250000 7.090909 6.818182 8.583333 6.416667 6.083333 5.750000 7.083333 2 1
708 2013 5 26 7.2500000 7.583333 8.833333 8.500000 9.333333 7.833333 8.000000 8.166667 8.916667 8.250000 2 1

Centroids.

centroids_plot <- data.frame(centroids)
centroids_plot$color <-rownames(centroids)
colnames(centroids_plot) <- c(colnames(centroids),"Profile")
kable(centroids)
8 9 10 11 12 13 14 15 16 17
2.515120 2.801432 3.417928 3.787969 4.227666 4.363395 4.489103 4.543058 4.612481 4.239605
6.964374 7.476228 7.725313 7.883218 8.178154 8.140686 8.075538 7.940529 7.875768 7.810506
14.266923 11.546448 11.325633 14.400040 11.718023 11.658908 11.609163 11.833090 11.463684 11.407992
149.272222 6.819444 7.194444 8.166667 8.069444 8.055556 8.277778 8.458333 7.704546 8.333333
326.800000 23.832685 8.570707 9.333333 8.135101 8.558081 8.291667 7.666667 7.722222 7.236111
ggparcoord(
  data = centroids_plot,
  columns=c(1:(ncol(centroids)-1)),
  scale="globalminmax",
  alphaLines=0.2,
  groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
  ylab("Solar Radiation Itensity (W/m2") + ggtitle(paste('C0875.h',variable,'Centroids'))# + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))

Mutual Information

Solar Cluster x Solar Cluster

Summary

Centroids

entropy <- read.table("~/Desktop/OCI/out/entropy-tsmissing.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))

++ Solar Cluster x Solar Transition

+++ Data Preparation

solar <- vector.solar[,c("YEAR","MON","DAY","istransition")]
solarc <- vector.solar[,c("YEAR","MON","DAY","clusterid")]
write.table(vector.solar[,c("clusterid","istransition")],file="~/Desktop/OCI/data/solarCid_solarTransition.csv",col.names=FALSE,row.names=FALSE,sep=",")

+++ Summary

entropy <- read.table("~/Desktop/OCI/out/entropy-solarCid_solarTransition.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))

++ Humidity Transition x Solar Transition

+++ Data Preparation

#Get the Humidity Transitions
relh <- vector.relh[,c("YEAR","MON","DAY","istransition")]

#Inner Join with the Solar Transitions
pair <- merge(relh,solar,by=c("YEAR","MON","DAY"))[,c("istransition.x","istransition.y")]
colnames(pair) <- c("relht","solart")

#Save
write.table(pair[,c("relht","solart")],file="~/Desktop/OCI/data/humidityTransition_SolarTransition.csv",col.names=FALSE,row.names=FALSE,sep=",")

+++ Summary

entropy <- read.table("~/Desktop/OCI/out/entropy-humidityTransition_SolarTransition.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))

Humidity Centroid x Solar Centroid

#Get the Humidity Transitions
relhc <- vector.relh[,c("YEAR","MON","DAY","clusterid")]

#Inner Join with the Solar and Humidity Centroids
pair <- merge(relhc,solarc,by=c("YEAR","MON","DAY"))[,c("clusterid.x","clusterid.y")]
colnames(pair) <- c("relhc","solarc")

#Save
write.table(pair[,c("relhc","solarc")],file="~/Desktop/OCI/data/c0875_humidityCentroid_SolarCentroid.csv",col.names=FALSE,row.names=FALSE,sep=",")

Summary

entropy <- read.table("~/Desktop/OCI/out/entropy-c0875_humidityCentroid_SolarCentroid.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))

Temperature Centroid x Solar Centroid

#Get the Temperature Centroids
tmpfc <- vector.tmpfc[,c("YEAR","MON","DAY","clusterid")]

#Inner Join with the Solar and Temperature Centroids
pair <- merge(tmpfc,solarc,by=c("YEAR","MON","DAY"))[,c("clusterid.x","clusterid.y")]
colnames(pair) <- c("tmpfc","solarc")

#Save
write.table(pair[,c("tmpfc","solarc")],file="~/Desktop/OCI/data/c0875_temperatureCentroid_SolarCentroid.csv",col.names=FALSE,row.names=FALSE,sep=",")

Summary

entropy <- read.table("~/Desktop/OCI/out/entropy-c0875_temperatureCentroid_SolarCentroid.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))

Dew Point Centroid x Solar Centroid

#Get the Humidity Transitions
dwpfc <- vector.dwpfc[,c("YEAR","MON","DAY","clusterid")]

#Inner Join with the Solar and Humidity Centroids
pair <- merge(dwpfc,solarc,by=c("YEAR","MON","DAY"))[,c("clusterid.x","clusterid.y")]
colnames(pair) <- c("dwpfc","solarc")

#Save
write.table(pair[,c("dwpfc","solarc")],file="~/Desktop/OCI/data/c0875_dewpointCentroid_SolarCentroid.csv",col.names=FALSE,row.names=FALSE,sep=",")

Summary

entropy <- read.table("~/Desktop/OCI/out/entropy-c0875_dewpointCentroid_SolarCentroid.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))

Pressure 5-Bins x Solar Centroid

#Get the Humidity Transitions
precc <- vector.prec[,c("YEAR","MON","DAY","clusterid")]

#Inner Join with the Solar and Humidity Centroids
pair <- merge(precc,solarc,by=c("YEAR","MON","DAY"))[,c("clusterid.x","clusterid.y")]
colnames(pair) <- c("precc","solarc")

#Save
write.table(pair[,c("precc","solarc")],file="~/Desktop/OCI/data/c0875_pressureCentroid_SolarCentroid.csv",col.names=FALSE,row.names=FALSE,sep=",")

Summary

entropy <- read.table("~/Desktop/OCI/out/entropy-c0875_pressureCentroid_SolarCentroid.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))

++ Wind Speed Transition x Solar Transition

+++ Data Preparation

#Get the wind Transitions
wind <- vector.wind[,c("YEAR","MON","DAY","istransition")]

#Inner Join with the Solar Transitions
pair <- merge(wind,solar,by=c("YEAR","MON","DAY"))[,c("istransition.x","istransition.y")]
colnames(pair) <- c("windt","solart")

#Save
write.table(pair[,c("windt","solart")],file="~/Desktop/OCI/data/windTransition_SolarTransition.csv",col.names=FALSE,row.names=FALSE,sep=",")

+++ Summary

entropy <- read.table("~/Desktop/OCI/out/entropy-windTransition_SolarTransition.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))

Wind Centroid x Solar Centroid

Data Preparation

#Get the wind Transitions
windc <- vector.wind[,c("YEAR","MON","DAY","clusterid")]

#Inner Join with the Solar Transitions
pair <- merge(windc,solarc,by=c("YEAR","MON","DAY"))[,c("clusterid.x","clusterid.y")]
colnames(pair) <- c("windc","solarc")

#Save
write.table(pair[,c("windc","solarc")],file="~/Desktop/OCI/data/c0875_windCentroid_SolarCentroid.csv",col.names=FALSE,row.names=FALSE,sep=",")

Summary

entropy <- read.table("~/Desktop/OCI/out/entropy-c0875_windCentroid_SolarCentroid.txt",header=TRUE)
ggplot(entropy, aes(x=Lag, y=Ixy)) + geom_line(colour="#94003C") + science_theme + ggtitle("Point-to-Point Mutual Information in Days") + xlab(expression(tau)) + ylab(expression(I(x[t],x[t+tau])))