Libraries and Data Pre-Processing

#Load Required Libraries
library(solarf)
library(knitr) #For pretty tables on the report using kable 
library(plyr)
library(ggplot2)
library(GGally)
library(caret)

#Theme for Plot
science_theme = theme(panel.background=element_blank(), panel.grid.major = element_line(size = 0.5, color = "grey"), 
    axis.line = element_line(size = 0.7, color = "black"), legend.position = c(0.9, 
        0.7), text = element_text(size = 14))

Load Station C0875 (Southeast Oahu Island)

C0875 <- read.csv("/Users/carlos/Google Drive/Research/SolarForecasting/datasets/mesowest/data/C0875_2011_2014.csv")

station.vectorize.hour <- function(station.dataframe,variable){
    library(reshape2)
    use.only.last.ocurrence <- function(minute.obs){
        return (minute.obs[1])
    }
    
    station.dataframe.vectorized <- dcast(data=station.dataframe, formula=YEAR + MON + DAY + HR ~ MIN,value.var = variable,fun.aggregate=use.only.last.ocurrence)
    return (station.dataframe.vectorized)
}

Vectorize Step

Vectorize C0875 over the training and test partitions.

#Moves from minute level to hourly level
C0875.vector.h <- station.vectorize.hour(C0875,"SOLR")
C0875.h <- C0875.vector.h[,c("YEAR","MON","DAY","HR")]
C0875.h$SOLR <- rowMeans(C0875.vector.h[,5:ncol(C0875.vector.h)],na.rm=TRUE) #Calculate the mean for the hour from min 0 to min 60 ; baseline is hence hour and 30 mins. 
C0875.h$HR <- C0875.h$HR + 1

C0875.vector <- station.vectorize(C0875.h,"SOLR")
C0875.vector <- C0875.vector[C0875.vector$YEAR>2011,c(1:3,11:20)] #SOLR starts on Feb 15, 2012. 

Data Point Filtering

#C0875.vector.filter <- station.vectorize(C0875[!is.na(C0875$SOLR),],"SOLR")
C0875.vector.filter <- C0875.vector[complete.cases(C0875.vector),]

Train and Test Split

#Separate the year of 2014 for prediction testing 
C0875.vector.filter.test <- C0875.vector.filter[C0875.vector.filter$YEAR==2014,]
C0875.vector.filter.train <- C0875.vector.filter[C0875.vector.filter$YEAR!=2014,]

#C0875.vector.filter.train <- C0875.vector.filter[C0875.vector.filter$MON==11 |
#                                                     C0875.vector.filter$MON==12 |
#                                                     C0875.vector.filter$MON==1 |
#                                                     C0875.vector.filter$MON==2 |
#                                                     C0875.vector.filter$MON==3 |
#                                                     C0875.vector.filter$MON==4
#                                                     ,]

Cluster Step

Cluster Train Dataset

Cluster with K-means = nclusters the training dataset and assign clusters to the daily profiles.

nclusters <- 5
set.seed(1234)
C0875.filter.train.clusters <- kmeans(C0875.vector.filter.train[,4:ncol(C0875.vector.filter)],nclusters)
C0875.vector.filter.train$clusterid <- C0875.filter.train.clusters[[1]]
C0875.vector.filter.train.centroids <- C0875.filter.train.clusters[[2]]

intensity.crescent.ordered.clusterid <- as.numeric(names(sort(rowSums(C0875.vector.filter.train.centroids))))
#Order the centroids in crescent order of ipntensity and reset rows
C0875.vector.filter.train.centroids <- C0875.vector.filter.train.centroids[intensity.crescent.ordered.clusterid,]
rownames(C0875.vector.filter.train.centroids) <- seq(1,nclusters)
#Update the centroid ids, by mapping the row ids before ordering to after ordering (element position is old ordering, value is new ordering)
C0875.vector.filter.train$clusterid <- intensity.crescent.ordered.clusterid[C0875.vector.filter.train$clusterid]

Centroids

centroids_plot <- data.frame(C0875.vector.filter.train.centroids)
centroids_plot$color <-rownames(C0875.vector.filter.train.centroids)
colnames(centroids_plot) <- c(colnames(C0875.vector.filter.train.centroids),"Profile")
kable(C0875.vector.filter.train.centroids)
8 9 10 11 12 13 14 15 16 17
63.75098 169.7902 273.9124 292.8852 312.6332 334.7674 341.3711 292.0897 236.2965 144.7556
85.89587 248.3484 427.8448 571.5542 629.9225 636.4579 573.0717 473.9575 343.4200 200.5349
95.86391 251.6029 441.4603 599.4685 742.0340 820.2648 836.0977 750.7302 556.1624 328.4749
196.82671 388.6677 599.1102 755.9045 814.2504 825.1088 758.6555 605.5889 455.8949 266.5185
207.56872 417.2371 629.2724 825.2460 954.5118 981.7954 940.3175 863.0599 680.0826 452.6717

Centroids

ggparcoord(
  data = centroids_plot,
  columns=c(1:ncol(C0875.vector.filter.train.centroids)),
  scale="globalminmax",
  alphaLines=0.2,
  groupColumn=ncol(centroids_plot)
) + geom_line(size=2,alpha=0.6) + science_theme + xlab("Hour of the Day") +
  ylab("Solar Radiation Itensity (W/m2") + ggtitle("Solar Radiation Discretized Profiles from 2003 to 2013 of Schofield Barracks Station") + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))

#+ scale_fill_discrete(name="Discretized\ Profile", breaks=c("1", "2", "3","4","5"), labels=c("1 (Overcast)", "2","3","4","5"))# + scale_colour_brewer(palette="Spectral")

Cluster Test Dataset (Ground Truth)

Results.groundtruth <- station.assign.daily.cluster(C0875.vector.filter.test,C0875.vector.filter.train.centroids)

Base Models Creation

Model 1 (J=2)

jointsize <- 2 

Create Argmax Model

#Argmax model creation
C0875.vector.filter.train <- station.add.missing.profiles(C0875.vector.filter.train)
## Warning: package 'zoo' was built under R version 3.1.3
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
C0875.filter.train.joint <- station.joint.probability(C0875.vector.filter.train$clusterid,jointsize)
C0875.argmax.model <- model.argmax(C0875.filter.train.joint)
## hash-2.2.6 provided by Decision Patterns

Perform Prediction

#Prediction using the model
C0875.vector.filter.pred <-station.predict.daily.cluster(station.vector.clustered=Results.groundtruth,model=C0875.argmax.model)

All probabilities for each Prediction

#All probabilities using the model
C0875.filter.train.conditional <- station.conditional.probability(C0875.filter.train.joint)
C0875.vector.filter.All.Probs <- station.clusterid.probability(station.conditional.probability=C0875.filter.train.conditional,clusterid.vector = Results.groundtruth$clusterid)

M1 Outcome Dataframe (Prediction|Support|AllProbs)

Results.m1<-data.frame(
  Pred = C0875.vector.filter.pred$Daym0,
  Support = C0875.vector.filter.pred$Freq,
  All.Probs = C0875.vector.filter.All.Probs,
  Hist = C0875.vector.filter.pred$Hist
)
kable(head(Results.m1))
Pred Support All.Probs Hist
NA NA NA NA
2 56 0.125 0.5 0.160714285714286 0.125 0.0892857142857143 2
2 29 0.206896551724138 0.551724137931034 0.137931034482759 0.0344827586206897 0.0689655172413793 1
3 48 0.0416666666666667 0.291666666666667 0.333333333333333 0.166666666666667 0.166666666666667 3
3 48 0.0416666666666667 0.291666666666667 0.333333333333333 0.166666666666667 0.166666666666667 3
3 48 0.0416666666666667 0.291666666666667 0.333333333333333 0.166666666666667 0.166666666666667 3

Model 2 (J=3)

jointsize <- 3 

Create Argmax Model

#Argmax model creation
C0875.filter.train.joint <- station.joint.probability(C0875.vector.filter.train$clusterid,jointsize)
C0875.argmax.model <- model.argmax(C0875.filter.train.joint)

Perform Prediction

#Prediction using the model
C0875.vector.filter.pred <-station.predict.daily.cluster(station.vector.clustered=Results.groundtruth,model=C0875.argmax.model)

All probabilities for each Prediction

#All probabilities using the model
C0875.filter.train.conditional <- station.conditional.probability(C0875.filter.train.joint)
C0875.vector.filter.All.Probs <- station.clusterid.probability(station.conditional.probability=C0875.filter.train.conditional,clusterid.vector = Results.groundtruth$clusterid)

M2 Outcome Dataframe (Prediction|Support|AllProbs)

Results.m2<-data.frame(
  Pred = C0875.vector.filter.pred$Daym0,
  Support = C0875.vector.filter.pred$Freq,
  All.Probs = C0875.vector.filter.All.Probs,
  Hist = C0875.vector.filter.pred$Hist
)
kable(head(Results.m2))
Pred Support All.Probs Hist
NA NA NA NA
NA NA NA NA
2 7 0.142857142857143 0.571428571428571 0.142857142857143 0.142857142857143 2 1
2 4 0.75 0.25 1 3
3 14 0.357142857142857 0.5 0.0714285714285714 0.0714285714285714 3 3
3 14 0.357142857142857 0.5 0.0714285714285714 0.0714285714285714 3 3

Model 3 (J=4)

kable(head(Results.m3))
Pred Support All.Probs Hist
NA NA NA NA
NA NA NA NA
NA NA NA NA
2 1 1 2 1 3
NA 0 NA 1 3 3
3 7 0.428571428571429 0.571428571428571 3 3 3

Model 4 (J=5)

kable(head(Results.m4))
Pred Support All.Probs Hist
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA 0 NA 2 1 3 3
NA 0 NA 1 3 3 3

Model 5 (J=6)

kable(head(Results.m5))
Pred Support All.Probs Hist
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA 0 NA 2 1 3 3 3

Model 6 (J=7)

kable(head(Results.m6))
Pred Support All.Probs Hist
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA

Model 7 (J=8)

kable(head(Results.m7))
Pred Support All.Probs Hist
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA
NA NA NA NA

Entropy Arbtitrer

Minimum entropy selection criteria

Results.decision <- min.entropy.arbitrer(data.frame(
  Results.m1$All.Probs,
  Results.m2$All.Probs,
  Results.m3$All.Probs,  
  Results.m4$All.Probs,  
  Results.m5$All.Probs,
  Results.m6$All.Probs,
  Results.m7$All.Probs  
  ),
  data.frame(
  Results.m1$Support,
  Results.m2$Support,
  Results.m3$Support,  
  Results.m4$Support,  
  Results.m5$Support,
  Results.m6$Support,
  Results.m7$Support
  )
)
Results.decision.no.support <- min.entropy.arbitrer(data.frame(
  Results.m1$All.Probs,
  Results.m2$All.Probs,
  Results.m3$All.Probs,  
  Results.m4$All.Probs,  
  Results.m5$All.Probs,
  Results.m6$All.Probs,
  Results.m7$All.Probs  
  ))

Error Calculation

Ensemble Prediction

Results <- data.frame(
  groundtruth = Results.groundtruth,
  m1 = Results.m1,
  m2 = Results.m2,
  m3 = Results.m3,  
  m4 = Results.m4,  
  m5 = Results.m5,
  m6 = Results.m6,  
  m7 = Results.m7,    
  decision = Results.decision, #Returns Inf if not a single entropy is available due to lack of hist. 
  decision.nosupport = Results.decision.no.support 
)

Output the final predictions using the arbitrer WITH support

Ensemble.Prediction <- data.frame(
  m1.pred = Results.m1$Pred,
  m2.pred = Results.m2$Pred,
  m3.pred = Results.m3$Pred,  
  m4.pred = Results.m4$Pred,  
  m5.pred = Results.m5$Pred,
  m6.pred = Results.m6$Pred,
  m7.pred = Results.m7$Pred,  
  decision = Results.decision$Index
)
Results$arbitrerPred <- Ensemble.Prediction[cbind(1:nrow(Ensemble.Prediction), Ensemble.Prediction$decision)]

Output the final predictions using the arbitrer WITHOUT support

Ensemble.Prediction.no.support <- data.frame(
  m1.pred = Results.m1$Pred,
  m2.pred = Results.m2$Pred,
  m3.pred = Results.m3$Pred,  
  m4.pred = Results.m4$Pred,  
  m5.pred = Results.m5$Pred,
  m6.pred = Results.m6$Pred,
  m7.pred = Results.m7$Pred,  
  decision = Results.decision.no.support$Index
)
Results$arbitrerPredNoSupport <- Ensemble.Prediction.no.support[cbind(1:nrow(Ensemble.Prediction.no.support), Ensemble.Prediction.no.support$decision)]

NA to Most Frequent Centroid

a <- summary(as.factor(C0875.vector.filter.train$clusterid[!is.na(C0875.vector.filter.train$clusterid)]))
base.centroid <- which(a == max(a))
Ensemble.Prediction$m1.pred[is.na(Ensemble.Prediction$m1.pred)] <- base.centroid
Ensemble.Prediction$m2.pred[is.na(Ensemble.Prediction$m2.pred)] <- base.centroid
Ensemble.Prediction$m3.pred[is.na(Ensemble.Prediction$m3.pred)] <- base.centroid
Ensemble.Prediction$m4.pred[is.na(Ensemble.Prediction$m4.pred)] <- base.centroid
Ensemble.Prediction$m5.pred[is.na(Ensemble.Prediction$m5.pred)] <- base.centroid
Ensemble.Prediction$m6.pred[is.na(Ensemble.Prediction$m6.pred)] <- base.centroid
## Warning in `[<-.factor`(`*tmp*`, is.na(Ensemble.Prediction$m6.pred), value
## = structure(c(NA, : invalid factor level, NA generated
Ensemble.Prediction$m7.pred[is.na(Ensemble.Prediction$m7.pred)] <- base.centroid
## Warning in `[<-.factor`(`*tmp*`, is.na(Ensemble.Prediction$m7.pred), value
## = structure(c(NA, : invalid factor level, NA generated
Results$arbitrerPred[is.na(Results$arbitrerPred)] <- base.centroid
Results$arbitrerPredNoSupport[is.na(Results$arbitrerPredNoSupport)] <- base.centroid

Individual Errors

Base Model 1 Error

m1.error <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Ensemble.Prediction$m1.pred,cluster.centroids=C0875.vector.filter.train.centroids)

hist(m1.error)

mean(m1.error,na.rm=TRUE)
## [1] 1821.048
sum(m1.error,na.rm=TRUE)/(10*length(m1.error[!is.na(m1.error)]))
## [1] 182.1048

Base Model 2 Error

m2.error <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Ensemble.Prediction$m2.pred,cluster.centroids=C0875.vector.filter.train.centroids)

hist(m2.error)

mean(m2.error,na.rm=TRUE)
## [1] 1749.29
sum(m2.error,na.rm=TRUE)/(10*length(m2.error[!is.na(m2.error)]))
## [1] 174.929

Base Model 3 Error

m3.error <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Ensemble.Prediction$m3.pred,cluster.centroids=C0875.vector.filter.train.centroids)

hist(m3.error)

mean(m3.error,na.rm=TRUE)
## [1] 1523.576
sum(m3.error,na.rm=TRUE)/(10*length(m3.error[!is.na(m3.error)]))
## [1] 152.3576

Base Model 4 Error

m4.error <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Ensemble.Prediction$m4.pred,cluster.centroids=C0875.vector.filter.train.centroids)

hist(m4.error)

mean(m4.error,na.rm=TRUE)
## [1] 1534.698
sum(m4.error,na.rm=TRUE)/(10*length(m4.error[!is.na(m4.error)]))
## [1] 153.4698

Base Model 5 Error

m5.error <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Ensemble.Prediction$m5.pred,cluster.centroids=C0875.vector.filter.train.centroids)

hist(m5.error)

mean(m5.error,na.rm=TRUE)
## [1] 2642.861
sum(m5.error,na.rm=TRUE)/(10*length(m5.error[!is.na(m5.error)]))
## [1] 264.2861

Base Model 6 Error

m6.error <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Ensemble.Prediction$m6.pred,cluster.centroids=C0875.vector.filter.train.centroids)

hist(m6.error)

mean(m6.error,na.rm=TRUE)
## [1] 2659.16
sum(m6.error,na.rm=TRUE)/(10*length(m6.error[!is.na(m6.error)]))
## [1] 265.916

Base Model 7 Error

m7.error <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Ensemble.Prediction$m7.pred,cluster.centroids=C0875.vector.filter.train.centroids)

hist(m7.error)

mean(m7.error,na.rm=TRUE)
## [1] 3566.394
sum(m7.error,na.rm=TRUE)/(10*length(m7.error[!is.na(m7.error)]))
## [1] 356.6394

Error From Ensemble Prediction WITHOUT support

ensemble.error.no.support <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Results$arbitrerPredNoSupport,cluster.centroids=C0875.vector.filter.train.centroids)

hist(ensemble.error.no.support)

mean(ensemble.error.no.support,na.rm=TRUE)
## [1] 1520.587
sum(ensemble.error.no.support,na.rm=TRUE)/(10*length(ensemble.error.no.support[!is.na(ensemble.error.no.support)]))
## [1] 152.0587

Error From Ensemble Prediction WITH support

ensemble.error <- error.absolute.centroid(station.vectorized.pred=C0875.vector.filter.test[,4:ncol(C0875.vector.filter.test)],pred.clusters=Results$arbitrerPred,cluster.centroids=C0875.vector.filter.train.centroids)

hist(ensemble.error)

mean(ensemble.error,na.rm=TRUE)
## [1] 1546.307
sum(ensemble.error,na.rm=TRUE)/(10*length(ensemble.error[!is.na(ensemble.error)]))
## [1] 154.6307

Arbiter Base Model Selection Count

Support & Entropy

How much from each model is the ensemble using WITH the support weight?

qplot(decision, data=Ensemble.Prediction, geom="histogram",ylab="Frequency",xlab="Model") + scale_x_discrete(limits=c("B1","B2","B3","B4","B5","B6","B7")) + science_theme

Entropy

qplot(decision, data=Ensemble.Prediction.no.support, geom="histogram",ylab="Frequency",xlab="Model") + scale_x_discrete(limits=c("B1","B2","B3","B4","B5","B6","B7")) + science_theme

It is noticiable that model B5 is not chosen as much anymore, and B1 now is chosen less than B2. B3 and B4 are now chosen much more for the prediction. This effect gives hopes that now the ensemble will do better, given it is choosen the model with the least amount of error more (B3).

Daily Mean Absolute Error per Model

mean_error <- data.frame(error=c(
as.integer(mean(m1.error,na.rm=TRUE)),
as.integer(mean(m2.error,na.rm=TRUE)),
as.integer(mean(m3.error,na.rm=TRUE)),
as.integer(mean(m4.error,na.rm=TRUE)),
as.integer(mean(m5.error,na.rm=TRUE)),
as.integer(mean(m6.error,na.rm=TRUE)),
as.integer(mean(m7.error,na.rm=TRUE)),
as.integer(mean(ensemble.error.no.support,na.rm=TRUE)),
as.integer(mean(ensemble.error,na.rm=TRUE))
))
labels <- c("B1","B2","B3","B4","B5","B6","B7","E","ESpt")

qplot(labels,mean_error$error, geom="bar",stat="identity",xlab="Model",ylab="Daily Mean Absolute Error") + science_theme

rownames(mean_error) <- labels
kable(mean_error)
error
B1 1821
B2 1749
B3 1523
B4 1534
B5 2642
B6 2659
B7 3566
E 1520
ESpt 1546

Daily Hourly Absolute Error per Model (HECO)

mean_error <- data.frame(error=c(
as.integer(sum(m1.error,na.rm=TRUE)/(10*length(m1.error[!is.na(m1.error)]))),
as.integer(sum(m2.error,na.rm=TRUE)/(10*length(m2.error[!is.na(m2.error)]))),
as.integer(sum(m3.error,na.rm=TRUE)/(10*length(m3.error[!is.na(m3.error)]))),
as.integer(sum(m4.error,na.rm=TRUE)/(10*length(m4.error[!is.na(m4.error)]))),
as.integer(sum(m5.error,na.rm=TRUE)/(10*length(m5.error[!is.na(m5.error)]))),
as.integer(sum(m6.error,na.rm=TRUE)/(10*length(m6.error[!is.na(m6.error)]))),
as.integer(sum(m7.error,na.rm=TRUE)/(10*length(m7.error[!is.na(m7.error)]))),
as.integer(sum(ensemble.error.no.support,na.rm=TRUE)/(10*length(ensemble.error.no.support[!is.na(ensemble.error.no.support)]))),
as.integer(sum(ensemble.error,na.rm=TRUE)/(10*length(ensemble.error[!is.na(ensemble.error)])))
))
labels <- c("m1","m2","m3","m4","m5","m6","m7","M","MSpt")
qplot(labels,mean_error$error, geom="bar",stat="identity",xlab="Model",ylab="Hourly Mean Absolute Error") + science_theme + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))

rownames(mean_error) <- labels
kable(mean_error)
error
m1 182
m2 174
m3 152
m4 153
m5 264
m6 265
m7 356
M 152
MSpt 154

We can see that the use of the support value improves compared to just using the minimum entropy value in order to decise which base model to predict.

Daily Missing Prediction per Model

missing <- data.frame(missing=c(
round(length(Ensemble.Prediction$m1.pred[is.na(Ensemble.Prediction$m1.pred)])/345,3)*100,
round(length(Ensemble.Prediction$m2.pred[is.na(Ensemble.Prediction$m2.pred)])/345,3)*100,
round(length(Ensemble.Prediction$m3.pred[is.na(Ensemble.Prediction$m3.pred)])/345,3)*100,
round(length(Ensemble.Prediction$m4.pred[is.na(Ensemble.Prediction$m4.pred)])/345,3)*100,
round(length(Ensemble.Prediction$m5.pred[is.na(Ensemble.Prediction$m5.pred)])/345,3)*100,
round(length(Ensemble.Prediction$m6.pred[is.na(Ensemble.Prediction$m6.pred)])/345,3)*100,
round(length(Ensemble.Prediction$m7.pred[is.na(Ensemble.Prediction$m7.pred)])/345,3)*100,
round(length(Results$arbitrerPred[is.na(Results$arbitrerPred)])/345,3)*100,
round(length(Results$arbitrerPredNoSupport[is.na(Results$arbitrerPredNoSupport)])/345,3)*100
))
labels <- c("m1","m2","m3","m4","m5","m6","m7","M","MSpt")

qplot(labels,missing$missing, geom="bar",stat="identity",xlab="Model",ylab=" % Daily Missing Predictions")  + science_theme  + theme(axis.text.x = element_text(hjust = 1, size=20,color="black"), axis.text.y = element_text(hjust = 1, size=20,color="black"), plot.title = element_text(size=20), axis.title.y=element_text(size=20), axis.title.x=element_text(size=20))