#library(ggplot2)
#library(tidyr)
#library(dplyr)
utility <- read.csv(utility_URL, header=TRUE)
colnames(utility) <- c('BDBID', "Start_of_Month", "End_of_Month", "AvgDailyElec", "AvgDailyFuel", "DailyTemperatureFile.OAT")
bdbid <- unique(utility$BDBID)
old <- Sys.time()
utility <- read.csv(utility_URL, header=TRUE)
bdbid <- unique(utility$BDBID)
brema_df <- NULL
chosen_models_brema <- function(list_of_BDBIDs, utility_file){
for (bdbid in list_of_BDBIDs) {
# Pulling in data
utility <- filter(utility_file, BDBID == bdbid)
y <- matrix(utility$AvgDailyElec)
x <- matrix(utility$DailyTemperatureFile.OAT)
y = matrix(y[sort.list(x)], nrow = length(y))
x = matrix(x[sort.list(x)], nrow = length(x))
# BREMA
models <- run_model(x,y, stat_test_flag = TRUE)
best_mod <- bestmodel_test(models)
brema <- best_mod$model
brema_cp1 <- best_mod$bestmodel.cp1
model_stats_df <- data.frame(t(best_mod$bestmodel.stats))
brema_temp <- data.frame(bdbid, brema, brema_cp1, model_stats_df)
brema_df <<- rbind(brema_df, brema_temp)
}
return(brema_df)
}
chosen_models_brema(bdbid, utility)
new <- Sys.time() - old
print(new)
## Time difference of 3.224642 secs
utility_test <- filter(utility, BDBID == 3563)
y <- matrix(utility_test$AvgDailyElec)
x <- matrix(utility_test$DailyTemperatureFile.OAT)
y = matrix(y[sort.list(x)], nrow = length(y))
x = matrix(x[sort.list(x)], nrow = length(x))
models <- run_model(x,y, stat_test_flag = TRUE)
best <- bestmodel_test(models)
plot_model(x, y, model=best$model, B=best$bestmodel.params, cp1=68.71429)
old <- Sys.time()
utility <- read.csv(utility_URL, header=TRUE)
bdbid <- unique(utility$BDBID)
brema_df_third <- NULL
chosen_models_brema_thirds <- function(list_of_BDBIDs, utility_file){
for (bdbid in list_of_BDBIDs) {
# Pulling in data
utility <- filter(utility_file, BDBID == bdbid)
y <- matrix(utility$AvgDailyElec)
x <- matrix(utility$DailyTemperatureFile.OAT)
y = matrix(y[sort.list(x)], nrow = length(y))
x = matrix(x[sort.list(x)], nrow = length(x))
# BREMA
models <- run_model_thirds(x,y, stat_test_flag = TRUE)
best_mod <- bestmodel_test(models)
brema <- best_mod$model
brema_cp1 <- best_mod$bestmodel.cp1
model_stats_df <- data.frame(t(best_mod$bestmodel.stats))
brema_temp <- data.frame(bdbid, brema, brema_cp1, model_stats_df)
brema_df_third <<- rbind(brema_df, brema_temp)
}
return(head(brema_df_third, n=6))
}
chosen_models_brema_thirds(bdbid, utility)
new <- Sys.time() - old
print(new)
## Time difference of 2.020427 secs
utility_test <- filter(utility, BDBID == 3563)
y <- matrix(utility_test$AvgDailyElec)
x <- matrix(utility_test$DailyTemperatureFile.OAT)
y = matrix(y[sort.list(x)], nrow = length(y))
x = matrix(x[sort.list(x)], nrow = length(x))
models <- run_model_thirds(x,y, stat_test_flag = TRUE)
best <- bestmodel_test(models)
plot_model(x, y, model=best$model, B=best$bestmodel.params, cp1=68.71429)
old <- Sys.time()
utility <- read.csv(utility_URL, header=TRUE)
bdbid <- unique(utility$BDBID)
brema_right_df <- NULL
chosen_models_brema_right <- function(list_of_BDBIDs, utility_file){
for (bdbid in list_of_BDBIDs) {
# Pulling in data
utility <- filter(utility_file, BDBID == bdbid)
y <- matrix(utility$AvgDailyElec)
x <- matrix(utility$DailyTemperatureFile.OAT)
y = matrix(y[sort.list(x)], nrow = length(y))
x = matrix(x[sort.list(x)], nrow = length(x))
# BREMA with Left Slope RMSE test change
right_RMSE <- run_model_right(x,y, stat_test_flag = TRUE)
best_mod_right <- bestmodel_test(right_RMSE)
right_RMSE_test <- best_mod_right$model
right_RMSE_cp1 <- best_mod_right$bestmodel.cp1
stats_right_df <- data.frame(t(best_mod_right$bestmodel.stats))
brema_right_temp <- data.frame(bdbid, right_RMSE_test,right_RMSE_cp1,stats_right_df)
brema_right_df <<- rbind(brema_right_df, brema_right_temp)
}
return(brema_right_df)
}
chosen_models_brema_right(bdbid, utility)
new <- Sys.time() - old
print(new)
## Time difference of 3.222642 secs
utility_test <- filter(utility, BDBID == 3563)
y <- matrix(utility_test$AvgDailyElec)
x <- matrix(utility_test$DailyTemperatureFile.OAT)
y = matrix(y[sort.list(x)], nrow = length(y))
x = matrix(x[sort.list(x)], nrow = length(x))
right_RMSE <- run_model_right(x,y, stat_test_flag = TRUE)
best <- bestmodel_test(right_RMSE)
plot_model(x, y, model=best$model, B=best$bestmodel.params, cp1=70.71429)
old <- Sys.time()
utility <- read.csv(utility_URL, header=TRUE)
bdbid <- unique(utility$BDBID)
brema_left_df <- NULL
chosen_models_brema_left <- function(list_of_BDBIDs, utility_file){
for (bdbid in list_of_BDBIDs) {
# Pulling in data
utility <- filter(utility_file, BDBID == bdbid)
y <- matrix(utility$AvgDailyElec)
x <- matrix(utility$DailyTemperatureFile.OAT)
y = matrix(y[sort.list(x)], nrow = length(y))
x = matrix(x[sort.list(x)], nrow = length(x))
# BREMA with Left Slope RMSE test change
left_RMSE <- run_model_left(x,y, stat_test_flag = TRUE)
best_mod_left <- bestmodel_test(left_RMSE)
left_RMSE_test <- best_mod_left$model
left_RMSE_cp1 <- best_mod_left$bestmodel.cp1
stats_left_df <- data.frame(t(best_mod_left$bestmodel.stats))
brema_left_temp <- data.frame(bdbid, left_RMSE_test,left_RMSE_cp1, stats_left_df)
brema_left_df <<- rbind(brema_left_df, brema_left_temp)
}
return(brema_left_df)
}
chosen_models_brema_left(bdbid, utility)
new <- Sys.time() - old
print(new)
## Time difference of 3.259843 secs
utility_test <- filter(utility, BDBID == 3563)
y <- matrix(utility_test$AvgDailyElec)
x <- matrix(utility_test$DailyTemperatureFile.OAT)
y = matrix(y[sort.list(x)], nrow = length(y))
x = matrix(x[sort.list(x)], nrow = length(x))
left_RMSE <- run_model_left(x,y, stat_test_flag = TRUE)
best <- bestmodel_test(left_RMSE)
plot_model(x, y, model=best$model, B=best$bestmodel.params, cp1=68.21429)
old <- Sys.time()
utility <- read.csv(utility_URL, header=TRUE)
bdbid <- unique(utility$BDBID)
final_aic_df <- NULL
chosen_models_AIC <- function(list_of_BDBIDs, utility_file){
for (bdbid in list_of_BDBIDs) {
# Pulling in data
utility <- filter(utility_file, BDBID == bdbid)
y <- matrix(utility$AvgDailyElec)
x <- matrix(utility$DailyTemperatureFile.OAT)
y = matrix(y[sort.list(x)], nrow = length(y))
x = matrix(x[sort.list(x)], nrow = length(x))
aic_model_stats <- one_model_aic(x,y)
model_aic_df <- data.frame(t(aic_model_stats$stats))
cp1 <- as.vector(t(aic_model_stats$cp1))
cp2 <- as.vector(t(aic_model_stats$cp2))
bdbid <- as.vector(bdbid)
model_t <- t(aic_model_stats$model)
model_aic_df <- mutate(model_aic_df, bdbid)
model_aic_df <- mutate(model_aic_df, cp1)
model_aic_df <- mutate(model_aic_df, cp2)
model_aic_df <- mutate(model_aic_df, model_t)
final_aic_df <<- data.frame(rbind(final_aic_df, model_aic_df))
}
print(final_aic_df)
}
chosen_models_AIC(bdbid, utility)
## RMSE Rsquared CV_RMSE SSE SST MSE
## 1 0.002547757 0.4140358 10.165165 1.557856e-04 0.0002658620 6.491068e-06
## 2 0.006097326 0.9505149 15.248152 8.922572e-04 0.0180308190 3.717738e-05
## 3 0.002175359 0.6577816 8.503088 1.135724e-04 0.0003318712 4.732185e-06
## 4 0.002281068 0.7499153 6.013158 1.248785e-04 0.0004993450 5.203272e-06
## 5 0.001713219 0.9043812 10.042950 7.044287e-05 0.0007367053 2.935120e-06
## 6 0.010219084 0.2506609 21.325703 2.506312e-03 0.0033446970 1.044297e-04
## bdbid cp1 cp2 aic_min_model
## 1 3526 65.21429 0 4P
## 2 3563 68.71429 0 3PC
## 3 3564 59.71429 0 4P
## 4 3744 67.71429 0 3PC
## 5 3815 62.71429 0 3PC
## 6 4478 57.71429 0 4P
new <- Sys.time() - old
print(new)
## Time difference of 3.334844 secs
utility_test <- filter(utility, BDBID == 3563)
y <- matrix(utility_test$AvgDailyElec)
x <- matrix(utility_test$DailyTemperatureFile.OAT)
y = matrix(y[sort.list(x)], nrow = length(y))
x = matrix(x[sort.list(x)], nrow = length(x))
aic_model_stats <- one_model_aic(x,y)
plot_model(x, y, model=aic_model_stats$model, B=aic_model_stats$params, cp1=68.71429)
old <- Sys.time()
utility <- read.csv(utility_URL, header=TRUE)
bdbid <- unique(utility$BDBID)
final_bic_df <- NULL
chosen_models_BIC <- function(list_of_BDBIDs, utility_file){
for (bdbid in list_of_BDBIDs) {
# Pulling in data
utility <- filter(utility_file, BDBID == bdbid)
y <- matrix(utility$AvgDailyElec)
x <- matrix(utility$DailyTemperatureFile.OAT)
y = matrix(y[sort.list(x)], nrow = length(y))
x = matrix(x[sort.list(x)], nrow = length(x))
bic_model_stats <- one_model_bic(x,y)
model_bic_df <- data.frame(t(bic_model_stats$stats))
cp1 <- as.vector(t(bic_model_stats$cp1))
cp2 <- as.vector(t(bic_model_stats$cp2))
bdbid <- as.vector(bdbid)
model_t <- t(bic_model_stats$model)
model_bic_df <- mutate(model_bic_df, bdbid)
model_bic_df <- mutate(model_bic_df, cp1)
model_bic_df <- mutate(model_bic_df, cp2)
model_bic_df <- mutate(model_bic_df, model_t)
final_bic_df <<- data.frame(rbind(final_bic_df, model_bic_df))
}
print(final_bic_df)
}
chosen_models_BIC(bdbid, utility)
## RMSE Rsquared CV_RMSE SSE SST MSE
## 1 0.003325666 0.001582405 13.268903 2.654413e-04 0.0002658620 1.106006e-05
## 2 0.006097326 0.950514884 15.248152 8.922572e-04 0.0180308190 3.717738e-05
## 3 0.003630377 0.046885665 14.190493 3.163112e-04 0.0003318712 1.317963e-05
## 4 0.003229289 0.498784604 8.512778 2.502794e-04 0.0004993450 1.042831e-05
## 5 0.001713219 0.904381208 10.042950 7.044287e-05 0.0007367053 2.935120e-06
## 6 0.011506836 0.049906634 24.013049 3.177774e-03 0.0033446970 1.324073e-04
## bdbid cp1 cp2 bic_min_model
## 1 3526 0.00000 0 2P
## 2 3563 68.71429 0 3PC
## 3 3564 0.00000 0 2P
## 4 3744 0.00000 0 2P
## 5 3815 62.71429 0 3PC
## 6 4478 0.00000 0 2P
new <- Sys.time() - old
print(new)
## Time difference of 3.280444 secs
utility_test <- filter(utility, BDBID == 3563)
y <- matrix(utility_test$AvgDailyElec)
x <- matrix(utility_test$DailyTemperatureFile.OAT)
y = matrix(y[sort.list(x)], nrow = length(y))
x = matrix(x[sort.list(x)], nrow = length(x))
bic_model_stats <- one_model_bic(x,y)
plot_model(x, y, model=bic_model_stats$model, B=bic_model_stats$params, cp1=0)
old <- Sys.time()
utility <- read.csv(utility_URL, header=TRUE)
bdbid <- unique(utility$BDBID)
cluster_df = NULL
dataframe_cluster_electricity <- function(list_of_BDBIDs, utility_test){
for (bdbid in list_of_BDBIDs) {
utility <- filter(utility_test, BDBID == bdbid)
y <- matrix(utility$AvgDailyElec)
x <- matrix(utility$DailyTemperatureFile.OAT)
y = matrix(y[sort.list(x)], nrow = length(y))
x = matrix(x[sort.list(x)], nrow = length(x))
clustered_list <- cluster_model(x,y)
clustered_df <- clustered_list$clustered
model_cluster <- create_cluster(clustered_list)
model = model_decision(clustered_list)
model_stats_df <- data.frame(t(model_cluster$stats))
cp1 <- as.vector(t(model_cluster$cp1))
cp2 <- as.vector(t(model_cluster$cp2))
bdbid <- as.vector(bdbid)
model_t <- t(model_cluster$model)
model_stats_df <- mutate(model_stats_df, bdbid)
model_stats_df <- mutate(model_stats_df, cp1)
model_stats_df <- mutate(model_stats_df, cp2)
model_stats_df <- mutate(model_stats_df, model_t)
cluster_df <<- data.frame(rbind(cluster_df, model_stats_df))
}
cluster_df <- tail(cluster_df[,c(7,8,9,10,1,2,3,4,5,6)], n=6)
return (cluster_df)
}
dataframe_cluster_electricity(bdbid, utility)
new <- Sys.time() - old
print(new)
## Time difference of 0.2702041 secs
utility_test <- filter(utility, BDBID == 3563)
y <- matrix(utility_test$AvgDailyElec)
x <- matrix(utility_test$DailyTemperatureFile.OAT)
y = matrix(y[sort.list(x)], nrow = length(y))
x = matrix(x[sort.list(x)], nrow = length(x))
clustered_list <- cluster_model(x,y)
clustered_df <- clustered_list$clustered
model_cluster <- create_cluster(clustered_list)
model = model_decision(clustered_list)
plot_model(x, y, model[1], B=model_cluster$params, cp1=64)
compare
Changing our interval to 1/3 decreases time and does not result in any changes to our model type or change point decision making.
data.frame(bdbid, brema_model, brema_cp1, brema_thirds_model, brema_thirds_cp1)
Optimizing for RMSE using the left or right slopes appears to result in higher model selection. Noticable increase in 4P and 5P models chosen using this metric.
data.frame(bdbid, brema_model, brema_cp1, brema_right_model, brema_right_cp1, brema_left_model, brema_left_cp1)
The AIC and BIC statistical metric are meant to provide a means model selection. BIC does the same, but is penalizes parameters more, so in general AIC tends to pick larger models. This needs to be tested further.
data.frame(bdbid, brema_model, brema_cp1, brema_aic_model, brema_aic_cp1, brema_bic_model, brema_bic_cp1)
Clustering reduced speeds significantly (3 seconds vs .4 seconds). Clustering tends to choose two clusters, and then uses a t-test and shape test to determine which model is the best.
data.frame(bdbid, brema_model, brema_cp1, brema_cluster_model, brema_cluster_cp1)