BEMA, but in R

Loading Libraries and Raw Data

#library(ggplot2)
#library(tidyr)
#library(dplyr)
utility <- read.csv(utility_URL, header=TRUE)
colnames(utility) <- c('BDBID', "Start_of_Month", "End_of_Month", "AvgDailyElec", "AvgDailyFuel", "DailyTemperatureFile.OAT")
bdbid <- unique(utility$BDBID) 

BREMA Output

old <- Sys.time()
utility <- read.csv(utility_URL, header=TRUE)
bdbid <- unique(utility$BDBID) 
brema_df <- NULL

chosen_models_brema <- function(list_of_BDBIDs, utility_file){
  for (bdbid in list_of_BDBIDs) {
    # Pulling in data
    utility <- filter(utility_file, BDBID == bdbid) 
    
    y <- matrix(utility$AvgDailyElec)
    x <- matrix(utility$DailyTemperatureFile.OAT)
    y = matrix(y[sort.list(x)], nrow = length(y))
    x = matrix(x[sort.list(x)], nrow = length(x))
    
    # BREMA 
    models <- run_model(x,y, stat_test_flag = TRUE)
    best_mod <- bestmodel_test(models)
    brema <- best_mod$model
    brema_cp1 <- best_mod$bestmodel.cp1
    model_stats_df <- data.frame(t(best_mod$bestmodel.stats))
    
    brema_temp <- data.frame(bdbid, brema, brema_cp1, model_stats_df)
    brema_df <<- rbind(brema_df, brema_temp)
  }

  return(brema_df)
}

chosen_models_brema(bdbid, utility)
new <- Sys.time() - old 
print(new)
## Time difference of 3.224642 secs
utility_test <- filter(utility, BDBID == 3563)
    y <- matrix(utility_test$AvgDailyElec)
    x <- matrix(utility_test$DailyTemperatureFile.OAT)
    y = matrix(y[sort.list(x)], nrow = length(y))
    x = matrix(x[sort.list(x)], nrow = length(x))
models <- run_model(x,y, stat_test_flag = TRUE)
best <- bestmodel_test(models)
plot_model(x, y, model=best$model, B=best$bestmodel.params, cp1=68.71429)

BREMA iterating through middle 1/3 of interval

old <- Sys.time()
utility <- read.csv(utility_URL, header=TRUE)
bdbid <- unique(utility$BDBID) 
brema_df_third <- NULL

chosen_models_brema_thirds <- function(list_of_BDBIDs, utility_file){
  for (bdbid in list_of_BDBIDs) {
    # Pulling in data
    utility <- filter(utility_file, BDBID == bdbid) 
    
    y <- matrix(utility$AvgDailyElec)
    x <- matrix(utility$DailyTemperatureFile.OAT)
    y = matrix(y[sort.list(x)], nrow = length(y))
    x = matrix(x[sort.list(x)], nrow = length(x))
    
    # BREMA 
    models <- run_model_thirds(x,y, stat_test_flag = TRUE)
    best_mod <- bestmodel_test(models)
    brema <- best_mod$model
    brema_cp1 <- best_mod$bestmodel.cp1
    model_stats_df <- data.frame(t(best_mod$bestmodel.stats))
    
    brema_temp <- data.frame(bdbid, brema, brema_cp1, model_stats_df)
    brema_df_third <<- rbind(brema_df, brema_temp)
  }

  return(head(brema_df_third, n=6))
}

chosen_models_brema_thirds(bdbid, utility)
new <- Sys.time() - old 
print(new)
## Time difference of 2.020427 secs
utility_test <- filter(utility, BDBID == 3563)
    y <- matrix(utility_test$AvgDailyElec)
    x <- matrix(utility_test$DailyTemperatureFile.OAT)
    y = matrix(y[sort.list(x)], nrow = length(y))
    x = matrix(x[sort.list(x)], nrow = length(x))
models <- run_model_thirds(x,y, stat_test_flag = TRUE)
best <- bestmodel_test(models)
plot_model(x, y, model=best$model, B=best$bestmodel.params, cp1=68.71429)

BREMA optimizing for right slope RMSE

old <- Sys.time()
utility <- read.csv(utility_URL, header=TRUE)
bdbid <- unique(utility$BDBID) 
brema_right_df <- NULL

chosen_models_brema_right <- function(list_of_BDBIDs, utility_file){
  for (bdbid in list_of_BDBIDs) {
    # Pulling in data
    utility <- filter(utility_file, BDBID == bdbid) 
    
    y <- matrix(utility$AvgDailyElec)
    x <- matrix(utility$DailyTemperatureFile.OAT)
    y = matrix(y[sort.list(x)], nrow = length(y))
    x = matrix(x[sort.list(x)], nrow = length(x))
    
    # BREMA with Left Slope RMSE test change
    right_RMSE <- run_model_right(x,y, stat_test_flag = TRUE)
    best_mod_right <- bestmodel_test(right_RMSE)
    right_RMSE_test <- best_mod_right$model
    right_RMSE_cp1 <- best_mod_right$bestmodel.cp1
    stats_right_df <- data.frame(t(best_mod_right$bestmodel.stats))
    
    brema_right_temp <- data.frame(bdbid, right_RMSE_test,right_RMSE_cp1,stats_right_df)
    brema_right_df <<- rbind(brema_right_df, brema_right_temp)
  }

  return(brema_right_df)
}

chosen_models_brema_right(bdbid, utility)
new <- Sys.time() - old 
print(new)
## Time difference of 3.222642 secs
utility_test <- filter(utility, BDBID == 3563)
    y <- matrix(utility_test$AvgDailyElec)
    x <- matrix(utility_test$DailyTemperatureFile.OAT)
    y = matrix(y[sort.list(x)], nrow = length(y))
    x = matrix(x[sort.list(x)], nrow = length(x))
right_RMSE <- run_model_right(x,y, stat_test_flag = TRUE)
best <- bestmodel_test(right_RMSE)
plot_model(x, y, model=best$model, B=best$bestmodel.params, cp1=70.71429)

BREMA optimizing for left slope RMSE

old <- Sys.time()
utility <- read.csv(utility_URL, header=TRUE)
bdbid <- unique(utility$BDBID) 
brema_left_df <- NULL

chosen_models_brema_left <- function(list_of_BDBIDs, utility_file){
  for (bdbid in list_of_BDBIDs) {
    # Pulling in data
    utility <- filter(utility_file, BDBID == bdbid) 
    
    y <- matrix(utility$AvgDailyElec)
    x <- matrix(utility$DailyTemperatureFile.OAT)
    y = matrix(y[sort.list(x)], nrow = length(y))
    x = matrix(x[sort.list(x)], nrow = length(x))
    
    # BREMA with Left Slope RMSE test change
    left_RMSE <- run_model_left(x,y, stat_test_flag = TRUE)
    best_mod_left <- bestmodel_test(left_RMSE)
    left_RMSE_test <- best_mod_left$model
    left_RMSE_cp1 <- best_mod_left$bestmodel.cp1
    stats_left_df <- data.frame(t(best_mod_left$bestmodel.stats))
    
    brema_left_temp <- data.frame(bdbid, left_RMSE_test,left_RMSE_cp1, stats_left_df)
    brema_left_df <<- rbind(brema_left_df, brema_left_temp)
  }

  return(brema_left_df)
}

chosen_models_brema_left(bdbid, utility)
new <- Sys.time() - old 
print(new)
## Time difference of 3.259843 secs
utility_test <- filter(utility, BDBID == 3563)
    y <- matrix(utility_test$AvgDailyElec)
    x <- matrix(utility_test$DailyTemperatureFile.OAT)
    y = matrix(y[sort.list(x)], nrow = length(y))
    x = matrix(x[sort.list(x)], nrow = length(x))
left_RMSE <- run_model_left(x,y, stat_test_flag = TRUE)
best <- bestmodel_test(left_RMSE)
plot_model(x, y, model=best$model, B=best$bestmodel.params, cp1=68.21429)

BREMA using AIC model

old <- Sys.time()
utility <- read.csv(utility_URL, header=TRUE)
bdbid <- unique(utility$BDBID) 
final_aic_df <- NULL

chosen_models_AIC <- function(list_of_BDBIDs, utility_file){
  for (bdbid in list_of_BDBIDs) {
    # Pulling in data
    utility <- filter(utility_file, BDBID == bdbid) 
    
    y <- matrix(utility$AvgDailyElec)
    x <- matrix(utility$DailyTemperatureFile.OAT)
    y = matrix(y[sort.list(x)], nrow = length(y))
    x = matrix(x[sort.list(x)], nrow = length(x))
    
    aic_model_stats <- one_model_aic(x,y)
    
    model_aic_df <- data.frame(t(aic_model_stats$stats))
    cp1 <- as.vector(t(aic_model_stats$cp1))
    cp2 <- as.vector(t(aic_model_stats$cp2))
    bdbid <- as.vector(bdbid)
    model_t <- t(aic_model_stats$model)

    model_aic_df <- mutate(model_aic_df, bdbid) 
    model_aic_df <- mutate(model_aic_df, cp1) 
    model_aic_df <- mutate(model_aic_df, cp2)
    model_aic_df <- mutate(model_aic_df, model_t)
    final_aic_df <<- data.frame(rbind(final_aic_df, model_aic_df))
  }

  print(final_aic_df)
}

chosen_models_AIC(bdbid, utility)
##          RMSE  Rsquared   CV_RMSE          SSE          SST          MSE
## 1 0.002547757 0.4140358 10.165165 1.557856e-04 0.0002658620 6.491068e-06
## 2 0.006097326 0.9505149 15.248152 8.922572e-04 0.0180308190 3.717738e-05
## 3 0.002175359 0.6577816  8.503088 1.135724e-04 0.0003318712 4.732185e-06
## 4 0.002281068 0.7499153  6.013158 1.248785e-04 0.0004993450 5.203272e-06
## 5 0.001713219 0.9043812 10.042950 7.044287e-05 0.0007367053 2.935120e-06
## 6 0.010219084 0.2506609 21.325703 2.506312e-03 0.0033446970 1.044297e-04
##   bdbid      cp1 cp2 aic_min_model
## 1  3526 65.21429   0            4P
## 2  3563 68.71429   0           3PC
## 3  3564 59.71429   0            4P
## 4  3744 67.71429   0           3PC
## 5  3815 62.71429   0           3PC
## 6  4478 57.71429   0            4P
new <- Sys.time() - old 
print(new)
## Time difference of 3.334844 secs
utility_test <- filter(utility, BDBID == 3563)
    y <- matrix(utility_test$AvgDailyElec)
    x <- matrix(utility_test$DailyTemperatureFile.OAT)
    y = matrix(y[sort.list(x)], nrow = length(y))
    x = matrix(x[sort.list(x)], nrow = length(x))
aic_model_stats <- one_model_aic(x,y)
plot_model(x, y, model=aic_model_stats$model, B=aic_model_stats$params, cp1=68.71429)

BREMA using BIC model

old <- Sys.time()
utility <- read.csv(utility_URL, header=TRUE)
bdbid <- unique(utility$BDBID) 
final_bic_df <- NULL

chosen_models_BIC <- function(list_of_BDBIDs, utility_file){
  for (bdbid in list_of_BDBIDs) {
    # Pulling in data
    utility <- filter(utility_file, BDBID == bdbid) 
    
    y <- matrix(utility$AvgDailyElec)
    x <- matrix(utility$DailyTemperatureFile.OAT)
    y = matrix(y[sort.list(x)], nrow = length(y))
    x = matrix(x[sort.list(x)], nrow = length(x))
    
    bic_model_stats <- one_model_bic(x,y)
    
    model_bic_df <- data.frame(t(bic_model_stats$stats))
    cp1 <- as.vector(t(bic_model_stats$cp1))
    cp2 <- as.vector(t(bic_model_stats$cp2))
    bdbid <- as.vector(bdbid)
    model_t <- t(bic_model_stats$model)

    model_bic_df <- mutate(model_bic_df, bdbid) 
    model_bic_df <- mutate(model_bic_df, cp1) 
    model_bic_df <- mutate(model_bic_df, cp2)
    model_bic_df <- mutate(model_bic_df, model_t)
    final_bic_df <<- data.frame(rbind(final_bic_df, model_bic_df))
  }

  print(final_bic_df)
}

chosen_models_BIC(bdbid, utility)
##          RMSE    Rsquared   CV_RMSE          SSE          SST          MSE
## 1 0.003325666 0.001582405 13.268903 2.654413e-04 0.0002658620 1.106006e-05
## 2 0.006097326 0.950514884 15.248152 8.922572e-04 0.0180308190 3.717738e-05
## 3 0.003630377 0.046885665 14.190493 3.163112e-04 0.0003318712 1.317963e-05
## 4 0.003229289 0.498784604  8.512778 2.502794e-04 0.0004993450 1.042831e-05
## 5 0.001713219 0.904381208 10.042950 7.044287e-05 0.0007367053 2.935120e-06
## 6 0.011506836 0.049906634 24.013049 3.177774e-03 0.0033446970 1.324073e-04
##   bdbid      cp1 cp2 bic_min_model
## 1  3526  0.00000   0            2P
## 2  3563 68.71429   0           3PC
## 3  3564  0.00000   0            2P
## 4  3744  0.00000   0            2P
## 5  3815 62.71429   0           3PC
## 6  4478  0.00000   0            2P
new <- Sys.time() - old 
print(new)
## Time difference of 3.280444 secs
utility_test <- filter(utility, BDBID == 3563)
    y <- matrix(utility_test$AvgDailyElec)
    x <- matrix(utility_test$DailyTemperatureFile.OAT)
    y = matrix(y[sort.list(x)], nrow = length(y))
    x = matrix(x[sort.list(x)], nrow = length(x))
bic_model_stats <- one_model_bic(x,y)
plot_model(x, y, model=bic_model_stats$model, B=bic_model_stats$params, cp1=0)

BREMA Clusters

old <- Sys.time()
utility <- read.csv(utility_URL, header=TRUE)
bdbid <- unique(utility$BDBID) 
cluster_df = NULL
    
dataframe_cluster_electricity <- function(list_of_BDBIDs, utility_test){
  for (bdbid in list_of_BDBIDs) {
    utility <- filter(utility_test, BDBID == bdbid)
      y <- matrix(utility$AvgDailyElec)
      x <- matrix(utility$DailyTemperatureFile.OAT)
      y = matrix(y[sort.list(x)], nrow = length(y))
      x = matrix(x[sort.list(x)], nrow = length(x))
    
    clustered_list <- cluster_model(x,y)
    clustered_df <- clustered_list$clustered
    model_cluster <- create_cluster(clustered_list)
    model = model_decision(clustered_list)
    
    model_stats_df <- data.frame(t(model_cluster$stats))
    cp1 <- as.vector(t(model_cluster$cp1))
    cp2 <- as.vector(t(model_cluster$cp2))
    bdbid <- as.vector(bdbid)
    model_t <- t(model_cluster$model)

    model_stats_df <- mutate(model_stats_df, bdbid) 
    model_stats_df <- mutate(model_stats_df, cp1) 
    model_stats_df <- mutate(model_stats_df, cp2)
    model_stats_df <- mutate(model_stats_df, model_t)
    cluster_df <<- data.frame(rbind(cluster_df, model_stats_df))
  }
    cluster_df <- tail(cluster_df[,c(7,8,9,10,1,2,3,4,5,6)], n=6)
    return (cluster_df)
}

dataframe_cluster_electricity(bdbid, utility)
new <- Sys.time() - old 
print(new)
## Time difference of 0.2702041 secs
utility_test <- filter(utility, BDBID == 3563)
    y <- matrix(utility_test$AvgDailyElec)
    x <- matrix(utility_test$DailyTemperatureFile.OAT)
    y = matrix(y[sort.list(x)], nrow = length(y))
    x = matrix(x[sort.list(x)], nrow = length(x))
clustered_list <- cluster_model(x,y)
clustered_df <- clustered_list$clustered
model_cluster <- create_cluster(clustered_list)
model = model_decision(clustered_list)
plot_model(x, y, model[1], B=model_cluster$params, cp1=64)

Compare All Models

compare

Preliminary Conclusions

Changing our interval to 1/3 decreases time and does not result in any changes to our model type or change point decision making.

data.frame(bdbid, brema_model, brema_cp1, brema_thirds_model, brema_thirds_cp1)

Optimizing for RMSE using the left or right slopes appears to result in higher model selection. Noticable increase in 4P and 5P models chosen using this metric.

data.frame(bdbid, brema_model, brema_cp1, brema_right_model, brema_right_cp1, brema_left_model, brema_left_cp1)

The AIC and BIC statistical metric are meant to provide a means model selection. BIC does the same, but is penalizes parameters more, so in general AIC tends to pick larger models. This needs to be tested further.

data.frame(bdbid, brema_model, brema_cp1, brema_aic_model, brema_aic_cp1, brema_bic_model, brema_bic_cp1)

Clustering reduced speeds significantly (3 seconds vs .4 seconds). Clustering tends to choose two clusters, and then uses a t-test and shape test to determine which model is the best.

data.frame(bdbid, brema_model, brema_cp1, brema_cluster_model, brema_cluster_cp1)