Renewable Energy Sources in Poland

Introduction

This is a dataset from Local Bank Data GUS.

Source: https://bdl.stat.gov.pl/bdl/start

The association part is done on a basis of Kaggle dataset & BDL.

Source: https://www.kaggle.com/datasets/marcinzwonik/poland-population-by-province-2001-2021

The analysis is done with the help of gemini.

Data Preparation

-----------> INCLUDE PACKAGES <----------

library(dplyr)
library(tidyr)
library(stringr)
library(sf)
library(ggplot2)

-----------> LOAD MAP <-----------

mapa_sf <- st_read(
  "data/wojewodztwa/wojewodztwa.dbf",
  quiet = TRUE,
  options = "ENCODING=UTF-8"
)

mapa_leaflet <- mapa_sf %>%
  st_transform(4326) %>%
  mutate(
    Region_Key = str_sub(JPT_KOD_JE, 1, 2),
    Region = iconv(JPT_NAZWA_, from = "", to = "UTF-8")
  )

-----------> LOAD & CLEAN BDL <-----------

bdl_raw <- read.table(
  "data/RYNE_1674_CTAB_20251110125829.csv",
  header = TRUE,
  sep = ";",
  dec = ",",
  fileEncoding = "UTF-8",
  stringsAsFactors = FALSE
) %>%
  select(-last_col())

bdl_raw$Nazwa <- iconv(bdl_raw$Nazwa, from = "WINDOWS-1250", to = "UTF-8")

year_cols <- 4:ncol(bdl_raw)
years <- 2006:(2006 + length(year_cols) - 1)
names(bdl_raw)[year_cols] <- paste0("Year_", years)

bdl_long <- bdl_raw %>%
  filter(!str_detect(Nazwa, "POLSKA")) %>%
  pivot_longer(
    cols = starts_with("Year_"),
    names_to = "Year_txt",
    values_to = "Production_GWh",
    values_transform = list(Production_GWh = as.numeric)
  ) %>%
  mutate(
    Year = as.numeric(str_remove(Year_txt, "Year_")),
    Region_Key = str_sub(Kod, 1, 2)
  ) %>%
  filter(!is.na(Production_GWh)) %>%
  select(Region_Key, Region = Nazwa, Year, Production_GWh)

-----------> JOIN MAP & DATA <-----------

map_data <- mapa_leaflet %>%
  left_join(bdl_long, by = "Region_Key")

saveRDS(map_data, "map_data.rds")
saveRDS(bdl_long, "bdl_long.rds")

-----------> ADD A LABEL COLUMN (VOIVODESHIP NAMES) <-----------
  
mapa_leaflet <- mapa_leaflet %>%
  mutate(Region_Label = Region)

-----------> PLOT THE MAP <-----------

ggplot(mapa_leaflet) +
  geom_sf(fill = "lightblue", color = "black") +  # polygons with borders
  geom_sf_text(aes(label = Region_Label), size = 3) +  # add names
  labs(title = "Polish Voivodeships") +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank()
  )
The map of Poland consists of 16 voivodeships.

The map of Poland consists of 16 voivodeships.

Dimension Reduction and Clustering

-----------> PACKAGES <-----------

library(readr)
library(dplyr)
library(tidyr)
library(factoextra)

-----------> LOAD DATA <-----------

bdl_long <- readRDS("bdl_long.rds")

-----------> CONVERT LONG TO WIDE FORMAT FOR PCA/CLUSTERING <-----------
wide <- bdl_long %>%
  pivot_wider(
    names_from = Year,
    values_from = Production_GWh
  )

-----------> SET ROW NAMES AND REMOVE METADATA COLUMNS <-----------
row.names(wide) <- wide$Region
wide <- wide[, -c(1,2)]  # Assuming columns 1 and 2 are Region and other metadata

-----------> SCALE THE DATA <-----------
data_scaled <- scale(wide)

-----------> PCA <-----------
pca <- prcomp(data_scaled)

-----------> SCREE PLOT <-----------
jpeg("pca_scree_plot.jpg", width = 800, height = 600)
fviz_eig(pca, addlabels = TRUE, ylim = c(0, 100)) +
  labs(title = "Scree Plot: Explained Variance by Principal Components")
dev.off()
'A common method for determining the number of PCs to be retained is a graphical representation known as a scree plot',     source: https://www.sciencedirect.com/topics/mathematics/scree-plot

‘A common method for determining the number of PCs to be retained is a graphical representation known as a scree plot’, source: https://www.sciencedirect.com/topics/mathematics/scree-plot

-----------> PCA BIPLOT <-----------
jpeg("pca_biplot.jpg", width = 800, height = 800)
fviz_pca_biplot(
  pca,
  repel = TRUE,
  col.var = "#2E9FDF",
  col.ind = "#696969",
  title = "PCA Biplot of Standardized Renewable Energy Data"
)
dev.off()
'A biplot is the simultaneous representation of rows and columns of a rectangular dataset. It is the generalization of a scatterplot to the case of mutlivariate data: it allows to visualize as much information as possible in a single graph (Greenacre, 2010).' source: https://cran.r-project.org/web/packages/dimensio/vignettes/pca.html

‘A biplot is the simultaneous representation of rows and columns of a rectangular dataset. It is the generalization of a scatterplot to the case of mutlivariate data: it allows to visualize as much information as possible in a single graph (Greenacre, 2010).’ source: https://cran.r-project.org/web/packages/dimensio/vignettes/pca.html

-----------> K-MEANS CLUSTERING <-----------
set.seed(42)
km <- kmeans(data_scaled, centers = 3, nstart = 25)

----------->K-means cluster plot <-----------
jpeg("kmeans_cluster.jpg", width = 800, height = 600)
fviz_cluster(km, data = data_scaled,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
             geom = "point",
             ellipse.type = "convex",
             ggtheme = theme_minimal(),
             main = "K-Means Clustering (k=3)")
dev.off()
'K-means is an iterative, centroid-based clustering algorithm that partitions a dataset into similar groups based on the distance between their centroids. The centroid, or cluster center, is either the mean or median of all the points within the cluster depending on the characteristics of the data.' source: https://www.ibm.com/think/topics/k-means-clustering

‘K-means is an iterative, centroid-based clustering algorithm that partitions a dataset into similar groups based on the distance between their centroids. The centroid, or cluster center, is either the mean or median of all the points within the cluster depending on the characteristics of the data.’ source: https://www.ibm.com/think/topics/k-means-clustering

-----------> HIERARCHICAL CLUSTERING <-----------
hc <- hclust(dist(data_scaled), method = "ward.D2")

-----------> Dendrogram <-----------
jpeg("hierarchical_dendrogram.jpg", width = 800, height = 600)
plot(hc, hang = -1, main = "Hierarchical Clustering Dendrogram")
dev.off()
'A dendrogram is a visual representation that illustrates the arrangement of clusters and their relationships with each other. The height of the branches in a dendrogram represents the distance or dissimilarity at which clusters merge. Lower heights indicate clusters joined at smaller distances, thus representing higher similarity.' source: https://www.datacamp.com/tutorial/hierarchical-clustering

‘A dendrogram is a visual representation that illustrates the arrangement of clusters and their relationships with each other. The height of the branches in a dendrogram represents the distance or dissimilarity at which clusters merge. Lower heights indicate clusters joined at smaller distances, thus representing higher similarity.’ source: https://www.datacamp.com/tutorial/hierarchical-clustering

Association

## Association Rule Mining
library(dplyr)
library(tidyr)
library(stringr)
library(arules)
library(arulesViz)
if(!require(htmlwidgets)) install.packages("htmlwidgets")
library(htmlwidgets)
library(htmltools)
if(!require(RColorBrewer)) install.packages("RColorBrewer")
library(RColorBrewer)
#-----------> 1. LOAD DATA ----------
energy_raw <- read.csv("data/RYNE_1674_CTAB_20251110125829.csv", 
                       sep = ";", dec = ",", 
                       check.names = FALSE, 
                       stringsAsFactors = FALSE)
energy_raw <- energy_raw[, colnames(energy_raw) != ""] # Remove empty columns

pop_raw <- read.csv("data/population_by_region_combined_pl.csv", 
                    check.names = FALSE, 
                    stringsAsFactors = FALSE)

#-----------> 2. CLEAN & PREPROCESS ----------
# Energy cleaning
energy_clean <- energy_raw %>%
  filter(Nazwa != "POLSKA") %>% 
  select(-Kod, -starts_with("Unnamed")) %>%
  pivot_longer(cols = -Nazwa, names_to = "Year_Raw", values_to = "Prod_GWh") %>%
  mutate(Year = as.integer(str_extract(Year_Raw, "\\d{4}")), Region = Nazwa) %>%
  select(Region, Year, Prod_GWh)

# Population cleaning
pop_clean <- pop_raw %>%
  select(Wojewodztwo, Rok, `ogółem ogółem`) %>%
  rename(Region = Wojewodztwo, Year = Rok, Pop = `ogółem ogółem`)

# Merge
bdl_combined <- inner_join(energy_clean, pop_clean, by = c("Region", "Year"))

# Fix Garbled Polish Characters
garbled_to_correct <- c("DOLNOĹšLÄ„SKIE" = "DOLNOŚLĄSKIE", "ĹšLÄ„SKIE" = "ŚLĄSKIE")
bdl_combined$Region <- ifelse(bdl_combined$Region %in% names(garbled_to_correct),
                              garbled_to_correct[bdl_combined$Region], bdl_combined$Region)

#-----------> 3. DISCRETIZE (NUMERIC INTERVALS ONLY) ----------
bdl_trans_df <- bdl_combined %>%
  mutate(
    Region = as.factor(Region),
    Population = cut(Pop, 
                     breaks = quantile(Pop, probs = seq(0, 1, 0.25), na.rm = TRUE),
                     include.lowest = TRUE, dig.lab = 5),
    Production = cut(Prod_GWh, 
                     breaks = quantile(Prod_GWh, probs = seq(0, 1, 0.25), na.rm = TRUE),
                     include.lowest = TRUE, dig.lab = 5)
  ) %>%
  select(Region, Population, Production)

#-----------> 4. GENERATE RULES ----------
transactions <- as(bdl_trans_df, "transactions")

# Setting support lower to catch specific Region + Population combinations
rules <- apriori(
  transactions,
  parameter = list(
    supp = 0.02,    
    conf = 0.5,     # Balance between strictness and quantity
    minlen = 2,
    maxlen = 3      # Limits rules to (Region + Pop) => Production
  ),
  control = list(verbose = FALSE)
)

#-----------> 5. FILTER FOR 32 RELATIONS ----------
# We target rules where Production is the result (RHS)
rules_filtered <- subset(rules, rhs %pin% "Production")

# Sort by lift and select the top 32
rules_top32 <- head(sort(rules_filtered, by = "lift", decreasing = TRUE), 32)

# Verify count in console
cat("Selected", length(rules_top32), "strong associations between Regions, Population, and Production.\n")
inspect(rules_top32)

#-----------> 6. VISUALIZE IN SINGLE PLOT ----------
association_plot <- plot(
  rules_top32,
  method = "graph",
  engine = "htmlwidget",
  shading = "lift"
)

# Display in R Markdown
association_plot

# Save to file
htmlwidgets::saveWidget(association_plot, "association_32_relations.html", selfcontained = TRUE)


#-----------> 5. FILTER & LABEL 32 RELATIONS ----------
# We target rules where Production is the result (RHS)
rules_filtered <- subset(rules, rhs %pin% "Production")

# Sort by lift and select the top 32
rules_top32 <- head(sort(rules_filtered, by = "lift", decreasing = TRUE), 32)

# Create a data frame version to add custom "Rule Names"
rules_df <- as(rules_top32, "data.frame") %>%
  mutate(
    Rule_ID = paste0("Rule_", row_number()),
    Meaningful_Name = paste("Pattern:", str_remove_all(rules, "[{}]"))
  )

# Verify with names in console
print(rules_df[, c("Rule_ID", "Meaningful_Name", "lift")])

#-----------> 6. VISUALIZE WITH LABELS ----------
# To show names in the graph, we can use the 'itemLabels' or simply 
# the default labels which represent the logic.

association_plot <- plot(
  rules_top32, 
  method = "graph", 
  engine = "htmlwidget",
  shading = "lift",
  # This control list helps customize the interactive widget
  control = list(
    nodeCol = brewer.pal(9, "Greens"),
    edgeCol = "gray",
    precision = 3
  )
)

library(DT)
rule_table <- datatable(rules_df, 
                        caption = 'Top 32 Logical Relations: Region & Population vs Energy Production',
                        colnames = c('ID', 'Logic', 'Support', 'Confidence', 'Coverage', 'Lift', 'Count', 'Label'),
                        options = list(pageLength = 10))

# Display
rule_table
association_plot
## Warning: `includeHTML()` was provided a `path` that appears to be a complete HTML document.
## ✖ Path: energy_association_rules_32.html
## ℹ Use `tags$iframe()` to include an HTML document. You can either ensure `path` is accessible in your app or document (see e.g. `shiny::addResourcePath()`) and pass the relative path to the `src` argument. Or you can read the contents of `path` and pass the contents to `srcdoc`.
visNetwork

‘Darker Green nodes indicate higher Lift (meaning the relationship is very strong and not just due to random chance). Lighter Green nodes indicate lower lift but still meeting your 0.5 confidence threshold.’

Shiny App

-----------> PACKAGES <-----------

library(shiny)
library(sf)
library(leaflet)
library(dplyr)
library(plotly)
library(tidyr)
library(stringr)


-----------> LOAD MAP DATA <-----------

mapa_sf <- st_read(
  "data/wojewodztwa/wojewodztwa.dbf",
  quiet = TRUE,
  options = "ENCODING=UTF-8"
)

mapa_leaflet <- mapa_sf %>%
  st_transform(4326) %>%
  mutate(
    Klucz_TERYT = str_sub(JPT_KOD_JE, 1, 2),
    Wojewodztwo = iconv(JPT_NAZWA_, from = "", to = "UTF-8")
  )

-----------> LOAD BDL DATA <-----------
  
dane_bdl_raw <- read.table(
  "data/RYNE_1674_CTAB_20251110125829.csv",
  header = TRUE,
  sep = ";",
  dec = ",",
  fileEncoding = "UTF-8",
  stringsAsFactors = FALSE
) %>%
  select(-last_col())

dane_bdl_raw$Nazwa <- iconv(dane_bdl_raw$Nazwa, from = "", to = "UTF-8")

-----------> RENAME THE COLUMNS <-----------
lata_idx <- 4:ncol(dane_bdl_raw)
lata <- 2006:(2006 + length(lata_idx) - 1)
names(dane_bdl_raw)[lata_idx] <- paste0("Rok_", lata)

-----------> LONG FORMAT <-----------
dane_dlugie_bdl <- dane_bdl_raw %>%
  filter(!str_detect(Nazwa, "POLSKA")) %>%
  pivot_longer(
    cols = starts_with("Rok_"),
    names_to = "Rok_txt",
    values_to = "Produkcja_GWh",
    values_transform = list(Produkcja_GWh = as.numeric)
  ) %>%
  mutate(
    Rok = as.numeric(str_remove(Rok_txt, "Rok_")),
    Klucz_TERYT = str_sub(Kod, 1, 2)
  ) %>%
  filter(!is.na(Produkcja_GWh))

-----------> UI <-----------
ui <- fluidPage(
  titlePanel("Renewable Energy Generation in Polish Voivodeships"),
  
  sidebarLayout(
    sidebarPanel(
      sliderInput(
        "rok",
        "Choose a year:",
        min = min(dane_dlugie_bdl$Rok),
        max = max(dane_dlugie_bdl$Rok),
        value = max(dane_dlugie_bdl$Rok),
        step = 1,
        sep = "",
        animate = animationOptions(interval = 1000, loop = TRUE)
      ),
      plotlyOutput("wykres_liniowy", height = 250),
      plotlyOutput("wykres_slupkowy", height = 300)
    ),
    mainPanel(
      leafletOutput("mapa", height = 700)
    )
  )
)

-----------> SERVER <-----------

server <- function(input, output, session) {
  
  # --- Aggregate data for map by Klucz_TERYT
  dane_mapy <- reactive({
    dane_dlugie_bdl %>%
      filter(Rok == input$rok) %>%
      group_by(Klucz_TERYT) %>%
      summarise(Produkcja_GWh = sum(Produkcja_GWh), .groups = "drop") %>%
      left_join(mapa_leaflet, by = "Klucz_TERYT") %>%
      st_as_sf()
  })
  
  # --- Color palette
  pal <- reactive({
    colorNumeric(
      "YlGnBu",
      domain = dane_mapy()$Produkcja_GWh,
      na.color = "#f0f0f0"
    )
  })
  
-----------> INITIAL MAP <-----------
  output$mapa <- renderLeaflet({
    leaflet() %>%
      addTiles() %>%
      setView(19.5, 52, 6)
  })
  
-----------> UPDATED MAP <-----------
  observe({
    leafletProxy("mapa", data = dane_mapy()) %>%
      clearShapes() %>%
      clearControls() %>%
      addPolygons(
        fillColor = ~pal()(Produkcja_GWh),
        color = "white",
        weight = 1,
        fillOpacity = 0.8,
        layerId = ~Klucz_TERYT,
        label = ~paste0(
          Wojewodztwo, "<br>",
          "Production: ",
          format(round(Produkcja_GWh, 1), big.mark = " "),
          " GWh"
        ),
        highlightOptions = highlightOptions(
          weight = 3,
          color = "black",
          bringToFront = TRUE
        )
      ) %>%
      addLegend(
        pal = pal(),
        values = ~Produkcja_GWh,
        title = paste("RES Production (", input$rok, ")", sep = ""),
        position = "bottomright"
      )
  })
  
-----------> LINE CHART (AGGREGATED PER TERYT) <-----------
  output$wykres_liniowy <- renderPlotly({
    req(input$mapa_shape_click)
    
    dane <- dane_dlugie_bdl %>%
      filter(Klucz_TERYT == input$mapa_shape_click$id) %>%
      group_by(Rok) %>%
      summarise(Produkcja_GWh = sum(Produkcja_GWh), .groups = "drop")
    
    region_name <- mapa_leaflet$Wojewodztwo[mapa_leaflet$Klucz_TERYT == input$mapa_shape_click$id][1]
    
    plot_ly(
      dane,
      x = ~Rok,
      y = ~Produkcja_GWh,
      type = "scatter",
      mode = "lines+markers",
      line = list(width = 3)
    ) %>%
      layout(
        title = region_name,
        xaxis = list(title = "Year"),
        yaxis = list(title = "Energy Generation [GWh]")
      )
  })
  
-----------> BAR CHART PER YEAR <-----------
  output$wykres_slupkowy <- renderPlotly({
    dane_bar <- dane_dlugie_bdl %>%
      filter(Rok == input$rok) %>%
      group_by(Nazwa) %>%
      summarise(Produkcja_GWh = sum(Produkcja_GWh), .groups = "drop")
    
    plot_ly(
      dane_bar,
      x = ~reorder(Nazwa, Produkcja_GWh),
      y = ~Produkcja_GWh,
      type = "bar"
    ) %>%
      layout(
        title = paste("RES generation in", input$rok),
        xaxis = list(title = "", tickangle = -45),
        yaxis = list(title = "Generation [GWh]")
      )
  })
}

-----------> RUN APP <-----------
shinyApp(ui, server)