# Load the packages.
library(dplyr)
library(tidyr)
library(hector)
library(hectorcal)
library(ggplot2)
library(cluster) # clustering algorithms
library(factoextra)
# Set up the directories.
PROJECT_DIR <- "/Users/dorh012/Documents/2019/hectorcal"
INPUT_DIR <- file.path(PROJECT_DIR, 'analysis', 'best_fit', 'final_conc', 'ensemble')
# Define Functions
# Set up Hector core with the best fit paramter values
#
# Args
# input: a dataframe of 1 nrow that contains values for Hector parameters and model name
# Return: a datat frame containing the Hector results.
param_run_Hector <- function(input){
# Select the paramter values from the input data frame.
params <- as.numeric(input[names(input) %in% c(ECS(), DIFFUSIVITY(), VOLCANIC_SCALE(), AERO_SCALE())])
names(params) <- names(input)[names(input) %in% c(ECS(), DIFFUSIVITY(), VOLCANIC_SCALE(), AERO_SCALE())]
# Parmeterize the hector cores.
lapply(hector_cores, parameterize_core, params = params)
# Run Hector
lapply(hector_cores, reset)
lapply(hector_cores, run, runtodate = 2100)
bind_rows(lapply(hector_cores, function(c){
tryCatch({
parameterize_core(params = params, core = c)
reset(core = c)
run(core = c, runtodate = 2100)
cbind(method = input[['method']],
cluster = input[['cluster']],
fetchvars(core = c, dates = 1850:2100, vars = c(GLOBAL_TEMP(), HEAT_FLUX())))
}, error = function(e){NULL})
}))
}
# Import the hector output and the best fit parameters.
best_fits <- readRDS(file.path(INPUT_DIR, 'best_fit_params.rds'))
# The Hector temp and heatflux data from Hector runs using the best fit parameters.
best_fit_output <- readRDS(file.path(INPUT_DIR, 'best_fit_hector_output.rds')) %>%
rename(experiment = scenario) %>%
mutate(experiment = if_else(grepl(pattern = 'rcp', x = experiment) & year >= 2005, experiment, 'historical')) %>%
distinct() %>%
left_join(best_fits %>%
select(model, method), by = 'model')
What happens if we categorize models by their end of century heat flux value?
How does EOC heat flux look like?
esm_comparison %>%
filter(variable == 'heatflux' & !grepl('esm', experiment)) %>%
filter(year == 2100) %>%
group_by(experiment) %>%
summarise(cmip_min = mean(mina),
cmip_max = mean(maxb)) %>%
ungroup %>%
gather(stat, value, cmip_min, cmip_max) %>%
mutate(height = 1) ->
esm_bounds
# What does the 2090 to 2100 mean heat flux and temp look like??
best_fit_output %>%
filter(year == 2100 ) %>%
filter(experiment != 'historical' & variable == 'heatflux') %>%
group_by(model, experiment, variable, method) %>%
summarise(value = mean(value)) %>%
ungroup %>%
mutate(height = 1) %>%
ggplot() +
geom_vline(data = esm_bounds, aes(xintercept = value, color = stat)) +
geom_histogram(aes(value)) +
#geom_histogram(aes(value), bins = 40) +
#geom_dotplot(data = esm_bounds, aes(value, fill = stat, color = stat)) +
facet_grid(variable ~ experiment) +
labs(title = 'EOC heatflux',
y = 'count',
x = 'heat flux W/m^2')

The vertical lines reprsent the esm or cmip min and max range values. There are rcp60 and rcp85 EOC heatflux values that exceede the cmip range the distribtuion of the heat flux is broken up in rcp60 and rcp 85 but it is not obvious if that is clusetering or because there is small sample size of models used to make the histograms (30 models or observations).
So what happens if we categorize the models into where they fall within the IQR? Since there are a fair amount of models that have heatflux that is somewhere close to the mean.
esm_comparison %>%
filter(variable == 'heatflux' & !grepl('esm', experiment)) %>%
filter(year == 2100) %>%
mutate(Q0 = mina,
Q100 = maxb,
Q50 = (Q0 + Q100) / 2,
Q25 = (Q0 + Q50) / 2,
Q75 = (Q50 + Q100) / 2) %>%
select(year, variable, experiment, Q0, Q25, Q50, Q75, Q100) ->
quantile_df
best_fit_output %>%
filter(year == 2100 ) %>%
filter(experiment != 'historical' & variable == 'heatflux') %>%
inner_join(quantile_df, by = c('experiment', 'year', 'variable')) %>%
mutate(category = '1st quant') %>%
mutate(category = if_else(value >= Q25, '2nd quant', category)) %>%
mutate(category = if_else(value >= Q50, '3nd quant', category)) %>%
mutate(category = if_else(value >= Q75, '4th quant', category)) %>%
select(model, category, experiment) ->
heat_flux_categorized
xmid <- 0.5*(min(best_fits$S) + max(best_fits$S))
xrng <- 1.5*(max(best_fits$S) - min(best_fits$S))
xlo <- xmid - 0.5*xrng
xhi <- xmid + 0.5*xrng
best_fits %>%
left_join(heat_flux_categorized) %>%
ggplot() +
geom_label(aes(S, diff, label = model, fill = category)) +
labs(title = 'EOC heatflux') +
facet_wrap('experiment') +
theme(legend.position = 'bottom') +
theme_bw(base_size = 14) +
ggplot2::xlim(c(xlo,xhi))

NA
There is not an obvious or consistent pattern here, but it is interesting that as the rcp increases that the more of the models fall furhter away from the mean, there are less models in the 2nd and 3rd qunatile ranges.
What happens if we do the PCA by the 4 parmeters?
# Store the in a data frame where the rows are named by the model
df <- data.frame(S = best_fits$S, diff = best_fits$diff, row.names = best_fits$model,
alpha = best_fits$alpha, volscl = best_fits$volscl, stringsAsFactors = FALSE)
k_4params <- kmeans(df, centers = 5, nstart = 25)
fviz_cluster(k_4params, data = df, axes = c(1, 2))

But now plot diff and S on the axses grouped by cluster.
cluster_rslts <- data.frame(model = names(k_4params$cluster),
cluster = as.character(k_4params$cluster))
best_fits %>%
left_join(cluster_rslts) %>%
ggplot(aes(S, diff, label = model, fill = cluster)) +
geom_label() +
theme_bw(base_size = 14) +
ggplot2::xlim(c(xlo,xhi)) +
labs(title = 'diff vs S grouped by cluster')
Joining, by = "model"
Column `model` joining character vector and factor, coercing into character vector

What happens if we do the PCA by the 4 parmeters + EOC heat flux and temp values from rcp85?
best_fit_output %>%
filter(variable %in% c('heatflux', 'Tgav') & year == 2100 & experiment == 'rcp85') %>%
group_by(model, variable) %>%
summarise(value = mean(value)) %>%
ungroup %>%
spread(variable, value) %>%
select(model, heatflux, Tgav) ->
EOC_hf
# Store the in a data frame where the rows are named by the model
df <- data.frame(S = best_fits$S, diff = best_fits$diff, row.names = best_fits$model,
alpha = best_fits$alpha, volscl = best_fits$volscl, model = best_fits$model, stringsAsFactors = FALSE) %>%
left_join(EOC_hf) %>%
select(-model)
Joining, by = "model"
df <- as.data.frame(df, row.names = best_fits$model)
k_4params_eocHF <- kmeans(df, centers = 5, nstart = 25)
fviz_cluster(k_4params_eocHF, data = df, axes = c(1, 2))

cluster_rslts2 <- data.frame(model = names(k_4params_eocHF$cluster),
cluster = as.character(k_4params_eocHF$cluster))
best_fits %>%
left_join(cluster_rslts2) %>%
ggplot(aes(S, diff, label = model, fill = cluster)) +
geom_label() +
theme_bw(base_size = 14) +
ggplot2::xlim(c(xlo,xhi)) +
labs(title = 'diff vs S grouped by cluster')
Joining, by = "model"
Column `model` joining character vector and factor, coercing into character vector

Well it looks like the models are broken up into the same groups again but when we visualize it the groups are more obvious. I wonder if what happens when we run Hector with the different centriod values?
Hector with kmeans centriod params
tibble::as_tibble(k_4params_eocHF$centers) %>%
mutate(method = 'params + EOC ',
cluster = 1:nrow(k_4params_eocHF$centers)) %>%
bind_rows(tibble::as_tibble(k_4params$centers) %>%
mutate(method = 'params') %>%
mutate(cluster1 = 1:5) %>%
# Rename the clusters so that they are comparable to the cluster results from the other kmean method
mutate(cluster = if_else(cluster1 == 2, 5, as.double(cluster1)),
cluster = if_else(cluster1 == 4, 1, as.double(cluster)),
cluster = if_else(cluster1 == 1, 4, as.double(cluster)),
cluster = if_else(cluster1 == 5, 2, as.double(cluster)))) ->
center_df
# Create a list of the hector cores that we are going to want to run.
hector_cores <- list(newcore(system.file('input/hector_rcp85_constrained.ini', package = 'hector'), name = 'rcp85'))
apply(X = center_df, MARGIN = 1, FUN = param_run_Hector) %>%
bind_rows() ->
central_output
How much do the central points or the paramters change for the clusters based on the differnt methods?
center_df %>%
select(S, diff, alpha, volscl, method, cluster) %>%
gather(param, value, S, diff, alpha, volscl) %>%
ggplot(aes(value, cluster, color = method)) +
geom_point(size = 2, alpha = 0.7) +
facet_wrap('param', scales = 'free') +
labs(title = 'center param values based on the two k methods')

whelp it looks like including the EOC temp and heat flux data did not change were the kmeans centers were so it probably will not impact the Hector output either.
ggplot(data = central_output %>%
filter(scenario =='rcp85') %>%
mutate(metho = factor(method,levels = c('params', 'params + EOC')))) +
geom_line(aes(year, value, color = method, linetype = method, group = interaction(method, cluster, scenario)),
size = 0.75) +
facet_grid(variable ~ cluster, scales = 'free_y')

Wrap Up
Well I offically have a case of the Friday brain mush and I am going to head out. I need to still look at the results by grouping the models the physics information, when I was running the calibrations earlier I was having issues getting the runs to converge :( and it looks like when we break up the models by physics we quickly run out of our data requirements (hist tas + future tas)
---
title: "Kmeans S vs diff investigation"
output: html_notebook
---

```{r, warning = FALSE, message = FALSE}
# Load the packages.
library(dplyr)
library(tidyr)
library(hector)
library(hectorcal)
library(ggplot2)
library(cluster)    # clustering algorithms
library(factoextra)


# Set up the directories.
PROJECT_DIR <- "/Users/dorh012/Documents/2019/hectorcal"
INPUT_DIR   <- file.path(PROJECT_DIR, 'analysis', 'best_fit', 'final_conc', 'ensemble')


# Define Functions 
# Set up Hector core with the best fit paramter values
#
# Args
#   input: a dataframe of 1 nrow that contains values for Hector parameters and model name
# Return: a datat frame containing the Hector results.
param_run_Hector <- function(input){

    # Select the paramter values from the input data frame.
    params <- as.numeric(input[names(input) %in% c(ECS(), DIFFUSIVITY(), VOLCANIC_SCALE(), AERO_SCALE())])
    names(params) <- names(input)[names(input) %in% c(ECS(), DIFFUSIVITY(), VOLCANIC_SCALE(), AERO_SCALE())]

    # Parmeterize the hector cores.
    lapply(hector_cores, parameterize_core, params = params)

    # Run Hector
    lapply(hector_cores, reset)
    lapply(hector_cores, run, runtodate = 2100)

    bind_rows(lapply(hector_cores, function(c){

        tryCatch({
            parameterize_core(params = params, core = c)
            reset(core = c)
            run(core = c, runtodate = 2100)
            cbind(method = input[['method']],
                  cluster = input[['cluster']],
                  fetchvars(core = c, dates = 1850:2100, vars = c(GLOBAL_TEMP(), HEAT_FLUX())))
            }, error = function(e){NULL})


    }))


}

```


```{r}
# Import the hector output and the best fit parameters. 
best_fits       <- readRDS(file.path(INPUT_DIR, 'best_fit_params.rds')) 
# The Hector temp and heatflux data from Hector runs using the best fit parameters.
best_fit_output <- readRDS(file.path(INPUT_DIR, 'best_fit_hector_output.rds')) %>%  
    rename(experiment = scenario) %>% 
    mutate(experiment = if_else(grepl(pattern = 'rcp', x = experiment) & year >= 2005, experiment, 'historical')) %>% 
    distinct() %>% 
    left_join(best_fits %>% 
                  select(model, method), by = 'model')
```


### What happens if we categorize models by their end of century heat flux value?  

How does EOC heat flux look like? 

```{r, fig.width=8, fig.height=4, warning = FALSE, message = FALSE}

esm_comparison %>% 
    filter(variable == 'heatflux' & !grepl('esm', experiment)) %>% 
    filter(year == 2100) %>%  
    group_by(experiment) %>% 
    summarise(cmip_min = mean(mina), 
              cmip_max = mean(maxb)) %>% 
    ungroup %>% 
    gather(stat, value, cmip_min, cmip_max) %>% 
    mutate(height = 1) -> 
    esm_bounds


# What does the 2090 to 2100 mean heat flux and temp look like?? 
best_fit_output %>% 
    filter(year == 2100 ) %>% 
    filter(experiment != 'historical' & variable == 'heatflux') %>% 
    group_by(model, experiment, variable, method) %>% 
    summarise(value = mean(value)) %>% 
    ungroup %>% 
    mutate(height = 1) %>% 
    ggplot() + 
            geom_vline(data = esm_bounds, aes(xintercept = value, color = stat)) +
    geom_histogram(aes(value)) +
    #geom_histogram(aes(value), bins = 40) + 
    #geom_dotplot(data = esm_bounds, aes(value, fill = stat, color = stat)) + 

    facet_grid(variable ~ experiment) + 
    labs(title = 'EOC heatflux', 
         y = 'count', 
         x = 'heat flux W/m^2') 
```


The vertical lines reprsent the esm or cmip min and max range values. There are rcp60 and rcp85 EOC heatflux values that exceede the cmip range the distribtuion of the heat flux is broken up in rcp60 and rcp 85 but it is not obvious if that is clusetering or because there is small sample size of models used to make the histograms (30 models or observations). 


<br>


So what happens if we categorize the models into where they fall within the IQR? Since there are a fair amount of models that have heatflux that is somewhere close to the mean. 


```{r, fig.width=16, fig.height=9, warning = FALSE, message = FALSE}

esm_comparison %>% 
filter(variable == 'heatflux' & !grepl('esm', experiment)) %>%  
filter(year == 2100) %>% 
        mutate(Q0 = mina, 
               Q100 = maxb,
               Q50 = (Q0 + Q100) / 2, 
               Q25 = (Q0 + Q50) / 2, 
               Q75 = (Q50 + Q100) / 2) %>% 
        select(year, variable, experiment, Q0, Q25, Q50, Q75, Q100) -> 
        quantile_df
    

best_fit_output %>% 
    filter(year == 2100 ) %>% 
    filter(experiment != 'historical' & variable == 'heatflux') %>% 
        inner_join(quantile_df, by = c('experiment', 'year', 'variable')) %>% 
        mutate(category = '1st quant') %>% 
        mutate(category = if_else(value >= Q25, '2nd quant', category)) %>% 
        mutate(category = if_else(value >= Q50, '3nd quant', category)) %>% 
        mutate(category = if_else(value >= Q75, '4th quant', category)) %>% 
        select(model, category, experiment) -> 
        heat_flux_categorized

xmid <- 0.5*(min(best_fits$S) + max(best_fits$S))
xrng <- 1.5*(max(best_fits$S) - min(best_fits$S))
xlo <- xmid - 0.5*xrng
xhi <- xmid + 0.5*xrng

best_fits %>%  
    left_join(heat_flux_categorized) %>% 
    ggplot() + 
    geom_label(aes(S, diff, label = model, fill = category)) + 
    labs(title = 'EOC heatflux') + 
    facet_wrap('experiment') + 
    theme(legend.position = 'bottom') + 
    theme_bw(base_size = 14) + 
    ggplot2::xlim(c(xlo,xhi))
        
```


There is not an obvious or consistent pattern here, but it is interesting that as the rcp increases that the more of the models fall furhter away from the mean, there are less models in the 2nd and 3rd qunatile ranges. 



### What happens if we do the PCA by the 4 parmeters? 

```{r, warning = FALSE, message = FALSE}
# Store the in a data frame where the rows are named by the model 
df <- data.frame(S = best_fits$S, diff = best_fits$diff, row.names = best_fits$model, 
                     alpha = best_fits$alpha, volscl = best_fits$volscl, stringsAsFactors = FALSE)
k_4params <- kmeans(df, centers = 5, nstart = 25)
fviz_cluster(k_4params, data = df, axes = c(1, 2))
```

But now plot diff and S on the axses grouped by cluster. 

```{r}
cluster_rslts <-  data.frame(model = names(k_4params$cluster), 
          cluster = as.character(k_4params$cluster))
    

best_fits %>% 
    left_join(cluster_rslts) %>% 
    ggplot(aes(S, diff, label = model, fill = cluster)) + 
    geom_label() + 
    theme_bw(base_size = 14) + 
    ggplot2::xlim(c(xlo,xhi)) + 
    labs(title = 'diff vs S grouped by cluster')

```

### What happens if we do the PCA by the 4 parmeters + EOC heat flux and temp values from rcp85?


```{r}
best_fit_output %>% 
    filter(variable %in% c('heatflux', 'Tgav') & year == 2100 & experiment == 'rcp85') %>% 
    group_by(model, variable) %>% 
    summarise(value = mean(value)) %>% 
    ungroup %>% 
    spread(variable, value) %>% 
    select(model, heatflux, Tgav) -> 
    EOC_hf

# Store the in a data frame where the rows are named by the model 
df <- data.frame(S = best_fits$S, diff = best_fits$diff, row.names = best_fits$model, 
                     alpha = best_fits$alpha, volscl = best_fits$volscl, model = best_fits$model, stringsAsFactors = FALSE) %>% 
    left_join(EOC_hf) %>%  
    select(-model)
    
  df <- as.data.frame(df, row.names = best_fits$model)  
    
    
k_4params_eocHF <- kmeans(df, centers = 5, nstart = 25)
fviz_cluster(k_4params_eocHF, data = df, axes = c(1, 2))
```


```{r}
cluster_rslts2 <-  data.frame(model = names(k_4params_eocHF$cluster), 
          cluster = as.character(k_4params_eocHF$cluster))
    

best_fits %>% 
    left_join(cluster_rslts2) %>% 
    ggplot(aes(S, diff, label = model, fill = cluster)) + 
    geom_label() + 
    theme_bw(base_size = 14) + 
    ggplot2::xlim(c(xlo,xhi)) + 
    labs(title = 'diff vs S grouped by cluster')

```



Well it looks like the models are broken up into the same groups again but when we visualize it the groups are more obvious. I wonder if what happens when we run Hector with the different centriod values? 

## Hector with kmeans centriod params


```{r, warning = FALSE, messsage = FALSE}

tibble::as_tibble(k_4params_eocHF$centers) %>%
    mutate(method = 'params + EOC ', 
           cluster = 1:nrow(k_4params_eocHF$centers)) %>% 
    bind_rows(tibble::as_tibble(k_4params$centers) %>%
                  mutate(method = 'params') %>% 
                  mutate(cluster1 = 1:5) %>% 
                  # Rename the clusters so that they are comparable to the cluster results from the other kmean method
                  mutate(cluster = if_else(cluster1 == 2, 5, as.double(cluster1)), 
                         cluster = if_else(cluster1 == 4, 1, as.double(cluster)), 
                         cluster = if_else(cluster1 == 1, 4, as.double(cluster)), 
                         cluster = if_else(cluster1 == 5, 2, as.double(cluster)))) -> 
    center_df
    
# Create a list of the hector cores that we are going to want to run.
hector_cores <- list(newcore(system.file('input/hector_rcp85_constrained.ini', package = 'hector'), name = 'rcp85'))

apply(X = center_df, MARGIN = 1, FUN = param_run_Hector) %>% 
    bind_rows() -> 
    central_output

```

How much do the central points or the paramters change for the clusters based on the differnt methods? 
```{r}
center_df %>% 
    select(S, diff, alpha, volscl, method, cluster) %>% 
    gather(param, value, S, diff, alpha, volscl) %>%  
    ggplot(aes(value, cluster, color = method)) + 
    geom_point(size = 2, alpha = 0.7) + 
    facet_wrap('param', scales = 'free') + 
    labs(title = 'center param values based on the two k methods')

```

whelp it looks like including the EOC temp and heat flux data did not change were the kmeans centers were so it probably will not impact the Hector output either. 


```{r}

ggplot(data = central_output %>% 
           filter(scenario =='rcp85') %>%  
           mutate(metho = factor(method,levels =  c('params', 'params + EOC')))) + 
    geom_line(aes(year, value, color = method, linetype = method, group = interaction(method, cluster, scenario)), 
              size = 0.75) + 
    facet_grid(variable ~ cluster, scales = 'free_y')

```

## Wrap Up 


Well I offically have a case of the Friday brain mush and I am going to head out. I need to still look at the results by grouping the models the physics information, when I was running the calibrations earlier I was having issues getting the runs to converge :( and it looks like when we break up the models by physics we quickly run out of our data requirements (hist tas + future tas)
