Power Plants - Predicting Power
MISC
Introduction
This data has been gathered at two solar power plants in India over a 34 day period. It has two pairs of files - each pair has one power generation dataset and one sensor readings dataset. The power generation datasets are gathered at the inverter level - each inverter has multiple lines of solar panels attached to it. The sensor data is gathered at a plant level - single array of sensors optimally placed at the plant.
Business Need
Can we predict the power generation for next couple of days? This will help with better grid management
Statistical Method
- facebook time-series PROPHET modeling
- at the aggregate plant level (all inverters)
- at the individual inverter level, and then rollup (aggregate sum) to power plant level**
Power Generation Inverter data set: Plant 1
Get and Clean Data
p1.gd = read_csv('Plant_1_Generation_Data.csv') %>%
slice_sample(prop = 1) %>% #!!<NOTE>temp, working with a sample of datset for speed purposes
clean_names() %>% #lowercase
select(sort(tidyselect::peek_vars())) %>% #sort cols alphabetically
select(where(is.factor),where(is.character),where(is.numeric)) #sort cols by data type
#OlsonNames()
#https://stackoverflow.com/questions/41479008/what-is-the-correct-tz-database-time-zone-for-india
p1.gd = p1.gd %>% mutate(
date_time = as.POSIXct(strptime(p1.gd$date_time, "%d-%m-%Y %H:%M"), tz = 'Asia/Kolkata'),
source_key = factor(p1.gd$source_key)
) %>% rename(inverter = source_key) %>%
arrange(date_time)
p1.gd$plant_id = NULLglimpse structure
## Rows: 68,778
## Columns: 6
## $ date_time <dttm> 2020-05-15 00:00:00, 2020-05-15 00:00:00, 2020-05-15 0...
## $ inverter <fct> 3PZuoBAID5Wc2HD, bvBOhCH3iADSZry, ih0vzX44oOqAx2f, 1IF5...
## $ ac_power <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ daily_yield <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ dc_power <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ total_yield <dbl> 6987759, 6316803, 6185184, 6183645, 7098099, 7038681, 7...
EDA
counts of each factor’s unique levels
## .
## inverter 22
reference: names of unique levels
## inverter
## 1 3PZuoBAID5Wc2HD
## 2 bvBOhCH3iADSZry
## 3 ih0vzX44oOqAx2f
## 4 1IF53ai7Xc0U56Y
## 5 ZoEaEvLYb1n2sOq
## 6 uHbuxQJl8lW7ozc
## 7 WRmjgnKYAwPKWDb
## 8 7JYdWkrLSPkdwr4
## 9 VHMLBKoKgIrUVDU
## 10 z9Y9gH1T5YWrNuG
## 11 iCRJl6heRkivqQ3
## 12 ZnxXDlPa8U1GXgE
## 13 adLQvlD726eNBSB
## 14 rGa61gmuvPhdLxV
## 15 1BY6WEcLGh8j5v7
## 16 McdE0feGgRqW7Ca
## 17 wCURE6d3bPkepu2
## 18 pkci93gMrogZuBj
## 19 sjndEbLyjtCKgGv
## 20 zBIq5rxdHJRwDNY
## 21 zVJPv84UY57bAof
## 22 YxYtjZvoooNbGkE
time series/anomoly Plot
library(anomalize)
# anomalize(data, target, method = c("iqr", "gesd"), alpha = 0.05, max_anoms = 0.2, verbose = FALSE)
# alpha: Controls the width of the "normal" range. Lower values are more conservative while higher values are less prone to incorrectly classifying "normal" observations.
# max_anoms: The maximum percent of anomalies permitted to be identified.
p1.gd.anomalize = p1.gd %>% arrange(date_time) %>%
mutate(inverter = fct_reorder(inverter, -daily_yield)) %>%
group_by(inverter) %>%
time_decompose(daily_yield, method = 'twitter', merge = TRUE) %>%
anomalize(remainder, alpha = 0.05, method = 'gesd') %>%
time_recompose()
p1.gd.anomalize %>%
plot_anomalies(
ncol = 1,
alpha_dots = 0.5,
alpha_circles = 0.5,
size_circles = 1.5,
time_recomposed = TRUE,
alpha_ribbon = 0.05
) + scale_y_continuous(labels = comma) +
labs(x = '', y = 'daily_yield', title = 'daily_yield')finalizing Data Set
Forecasting at the Aggregate Level
aggregating total power at the daily level
p1.agg = p1.gd %>% group_by(date_time) %>% summarise(daily_yield = sum(daily_yield), .groups = 'drop')
p1.agg %>% plot_ly(x = ~date_time, y = ~daily_yield) %>% add_lines() %>% layout(title = 'Plant 1 total daily_yield KW')## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
Create Model
#renaming cols to prophet's col conventions
p1.agg = p1.agg %>% select(ds = date_time, y = daily_yield)
#creating model
p1.agg.model = p1.agg %>% prophet()
#using model make future period df
p1.agg.future = p1.agg.model %>% make_future_dataframe(
periods = 96 * 7, #there are 96 fifteen mintue periods in one day, so this represents 7 days
freq = 15 * 60) # data is at the 15 minute frequency level (freq is in seconds)
#make forecasts df
p1.agg.forecast = p1.agg.model %>% predict(p1.agg.future)
#plot forecast
p1.agg.model %>% dyplot.prophet(p1.agg.forecast)## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
Power Generation Inverter data set: Plant 2
Get and Clean Data
p2.gd = read_csv('Plant_2_Generation_Data.csv') %>%
slice_sample(prop = 1) %>% #!!<NOTE>temp, working with a sample of datset for speed purposes
clean_names() %>% #lowercase
select(sort(tidyselect::peek_vars())) %>% #sort cols alphabetically
select(where(is.POSIXct), where(is.factor),where(is.character),where(is.numeric)) #sort cols by data type
#OlsonNames()
#https://stackoverflow.com/questions/41479008/what-is-the-correct-tz-database-time-zone-for-india
p2.gd = p2.gd %>% mutate(
#date_time = as.POSIXct(strptime(p2.gd$date_time, "%d-%m-%Y %H:%M"), tz = 'Asia/Kolkata'),
source_key = factor(p2.gd$source_key)
) %>% rename(inverter = source_key) %>%
arrange(date_time)
p2.gd$plant_id = NULLglimpse structure
## Rows: 67,698
## Columns: 6
## $ date_time <dttm> 2020-05-15 00:00:00, 2020-05-15 00:00:00, 2020-05-15 0...
## $ inverter <fct> xoJJ8DcxJEcupym, Quc1TzYxW2pYoWX, WcxssY2VbP4hApt, q49J...
## $ ac_power <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ daily_yield <dbl> 0.0000, 5495.0000, 0.0000, 4315.0000, 651.2000, 0.0000,...
## $ dc_power <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ total_yield <dbl> 209143593, 329509085, 181695261, 339923, 1348350801, 22...
EDA
counts of each factor’s unique levels
## .
## inverter 22
reference: names of unique levels
## inverter
## 1 xoJJ8DcxJEcupym
## 2 Quc1TzYxW2pYoWX
## 3 WcxssY2VbP4hApt
## 4 q49J1IKaHRwDQnt
## 5 PeE6FRyGXUgsRhN
## 6 vOuJvMaM2sgwLmb
## 7 oZZkBaNadn6DNKz
## 8 Qf4GUc1pJu5T6c6
## 9 xMbIugepa2P7lBB
## 10 81aHJ1q11NBPMrL
## 11 NgDl19wMapZy17u
## 12 LYwnQax7tkwH5Cb
## 13 V94E5Ben1TlhnDV
## 14 IQ2d7wF4YD8zU1Q
## 15 oZ35aAeoifZaQzV
## 16 Mx2yZCDsyf6DPfv
## 17 9kRcWv60rDACzjR
## 18 rrq4fwE8jgrTyWY
## 19 mqwcsP2rE7J0TFp
## 20 4UPUqMRk7TRMgml
## 21 Et9kgGMDl729KT4
## 22 LlT2YUhhzqhg5Sw
time series/anomoly Plot
library(anomalize)
# anomalize(data, target, method = c("iqr", "gesd"), alpha = 0.05, max_anoms = 0.2, verbose = FALSE)
# alpha: Controls the width of the "normal" range. Lower values are more conservative while higher values are less prone to incorrectly classifying "normal" observations.
# max_anoms: The maximum percent of anomalies permitted to be identified.
p2.gd.anomalize = p2.gd %>% arrange(date_time) %>%
mutate(inverter = fct_reorder(inverter, -daily_yield)) %>%
group_by(inverter) %>%
time_decompose(daily_yield, method = 'twitter', merge = TRUE) %>%
anomalize(remainder, alpha = 0.05, method = 'gesd') %>%
time_recompose()
p2.gd.anomalize %>%
plot_anomalies(
ncol = 1,
alpha_dots = 0.5,
alpha_circles = 0.5,
size_circles = 1.5,
time_recomposed = TRUE,
alpha_ribbon = 0.05
) + scale_y_continuous(labels = comma) +
labs(x = '', y = 'daily_yield', title = 'daily_yield')identify the 4 faulty inverters
## # A tibble: 4 x 2
## inverter n
## <fct> <int>
## 1 IQ2d7wF4YD8zU1Q 2355
## 2 mqwcsP2rE7J0TFp 2355
## 3 NgDl19wMapZy17u 2355
## 4 xMbIugepa2P7lBB 2355
p2.faulty.inverters = p2.gd %>% count(inverter) %>% arrange(n) %>% head(4) %>% select(inverter) %>% pull %>% as.character- ‘good’ inverters (NON-faulty inverters that didn’t have complete shutdowns)
- ‘bad’ inverters (FAULTY inverters that had complete shutdowns)
finalizing Data Sets
#filter to only good inverters
p2.gd.good = p2.gd %>% filter(!inverter %in% p2.faulty.inverters)
#filter to only bad inverters
p2.gd.bad = p2.gd %>% filter(inverter %in% p2.faulty.inverters)Forecasting at the Aggregate Level for ‘good’ inverters
aggregating total power at the daily level
Create Model
#renaming cols to prophet's col conventions
p2.agg.good = p2.agg.good %>% select(ds = date_time, y = daily_yield)
#creating model
p2.agg.good.model = p2.agg.good %>% prophet()## Disabling yearly seasonality. Run prophet with yearly.seasonality=TRUE to override this.
#using model make future period df
p2.agg.good.future = p2.agg.good.model %>% make_future_dataframe(
periods = 96 * 7, #there are 96 fifteen mintue periods in one day, so this represents 7 days
freq = 15 * 60,
include_history = TRUE
) # data is at the 15 minute frequency level (freq is in seconds)
#make forecasts df
p2.agg.good.forecast = p2.agg.good.model %>% predict(p2.agg.good.future)
p2.agg.good.forecast %>% head %>% DT::datatable()Forecasting at the Aggregate Level for ‘bad’ inverters
aggregating total power at the daily level
p2.agg.bad = p2.gd.bad %>% group_by(date_time) %>% summarise(daily_yield = sum(daily_yield), .groups = 'drop')
p2.agg.bad %>% plot_ly(x = ~date_time, y = ~daily_yield, color = I('darkorange')) %>% add_lines() %>% layout(title = 'Plant 2 bad inverters total daily_yield KW')Create Model
#renaming cols to prophet's col conventions
p2.agg.bad = p2.agg.bad %>% filter(date_time >= '2020-05-31 UTC') %>% select(ds = date_time, y = daily_yield)
#creating model
p2.agg.bad.model = p2.agg.bad %>% prophet()
#using model make future period df
p2.agg.bad.future = p2.agg.bad.model %>% make_future_dataframe(
periods = 96 * 7, #there are 96 fifteen mintue periods in one day, so this represents 7 days
freq = 15 * 60,
include_history = TRUE
) # data is at the 15 minute frequency level (freq is in seconds)
#make forecasts df
p2.agg.bad.forecast = p2.agg.bad.model %>% predict(p2.agg.bad.future)
p2.agg.bad.forecast %>% head %>% DT::datatable()#plot forecast
p2.agg.bad.model %>% dyplot.prophet(p2.agg.bad.forecast) %>% dygraphs::dyOptions(colors = c('darkorange','blue'))