Load packages

pacman::p_load(
  rio,          
  here,         
  tidyverse,
  lubridate,
  plotly,
  gganimate,
  ggExtra,      
  ggalt,      
  ggcorrplot,
  ggpubr,
  ggplot2, # ggarrange()
  zoo,
  data.table, 
  formattable,
  tidyr,
  ggpmisc, # stat_valey
  ggalluvial, # sankey chart
  RColorBrewer,  # color pattlet 
  CGPfunctions, # slop graph
  ggrepel, # avoid overlap points
  ggthemes,
  viridis
  )

Data Visualisation

In this project, I am going to use the open data named Premium unleaded petrol prices excluding tax and duty compared with EU countries which is published by GOV.UK

Import data

mydata <- import(here("data", "table_511_and_521.xlsx"), which = "5.1.1 (excl tax)", skip = 7)  

Data cleaning

# Price excl tax 
data <- mydata %>% janitor::clean_names()   # clean cols name 
colnames(data)[18] <- 'UK' # rename 

# remove NA value 
data <- data[complete.cases(data), ]

# remove unwanted cols 
data$month <- NULL
data$day_in_month_of_price_snapshot <- NULL
data$uk_rank_in_eu14_plus_uk <- NULL
data$uk_rank_in_eu27_plus_uk <- NULL

# change variable class 
data$year <- as.factor(data$year)
# calculate annual avg price (not weekly price)
data <- data %>% 
  group_by(year) %>% 
    summarise(across(2:28, ~ mean(.x, na.rm = TRUE)))

Data visualisation

Slope graph

# wide to long format 
dtslop <- data %>% 
  pivot_longer(
    cols = belgium:slovenia,
    names_to = "country",
    values_to = "price_excl_tax"
  )

# rounding value 
dtslop$price_excl_tax <- round(dtslop$price_excl_tax, 1)

# class check 
str(dtslop)
## tibble [297 × 3] (S3: tbl_df/tbl/data.frame)
##  $ year          : Factor w/ 11 levels "2013","2014",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ country       : chr [1:297] "belgium" "denmark" "finland" "france" ...
##  $ price_excl_tax: num [1:297] 58.2 63.4 58.7 55.2 58 57.8 57.2 59.9 57.9 56.8 ...

Using newggslopegraph {CGPfunctions}

# “econ” theme
t <- dtslop %>% filter(year %in% c("2019", "2020", "2021", "2022")) # country has chr class so must apply dplyr %in%, not ==

CGPfunctions::newggslopegraph(t, year, price_excl_tax, country,
                Title = "Premium unleaded petrol prices excluding tax and duty",
                SubTitle = "2019-2023",
                Caption = "By R CHARTS",
                CaptionJustify = "left",
                ThemeChoice = "econ",
                DataTextSize = 2.5,
                DataLabelPadding = .05)
## 
## Converting 'year' to an ordered factor
## Warning: ggrepel: 21 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# “wsj” theme
t <- t %>% filter(country %in% c("UK", "sweden", "france", "netherlands", "belgium", "luxembourg", "germany", "greece", "malta", "italy", "findland")) 

CGPfunctions::newggslopegraph(t, year, price_excl_tax, country,
                Title = "Premium unleaded petrol prices excluding tax and duty",
                SubTitle = "2019-2023",
                Caption = "By R CHARTS",
                CaptionJustify = "left",
                ThemeChoice = "wsj",
                DataTextSize = 2.5,
                DataLabelPadding = .05)
## 
## Converting 'year' to an ordered factor

Compare price excl - incl tax & duty

Import data

Here we import the data of Premium unleaded petrol prices excluding tax and duty compared with EU countries

# Price incl tax
mydata2 <- import(here("data", "table_511_and_521.xlsx"), which = "5.1.1 (incl tax)", skip = 7)

data2 <- mydata2 %>% janitor::clean_names()   # clean cols name 
colnames(data2)[18] <- 'UK' # rename 

data2 <- data2[complete.cases(data2), ]

data2$month <- NULL
data2$day_in_month_of_price_snapshot <- NULL
data2$uk_rank_in_eu14_plus_uk <- NULL
data2$uk_rank_in_eu27_plus_uk <- NULL

data2$year <- as.factor(data2$year)
# calculate annual avg price (not weekly price)
data2 <- data2 %>% 
  group_by(year) %>% 
  summarise(across(2:28, ~ mean(.x, na.rm = TRUE)))
# wide to long format 
dtslop2 <- data2 %>% 
  pivot_longer(
    cols = belgium:slovenia,
    names_to = "country",
    values_to = "price_incl_tax"
  )

# rounding value 
dtslop2$price_incl_tax <- round(dtslop2$price_incl_tax, 0)

# data cleaning 
dtam <- cbind(dtslop, dtslop2$price_incl_tax) 
colnames(dtam)[4] <- 'price_incl_tax'
dtam$price_incl_tax <- round(dtam$price_incl_tax, 1)

For the purpose of visualtion, we select data in 2022:

# select 1 year only 
b <- dtam %>% filter(year %in% c("2022")) 

# transform data from wide to long format 
c <- b %>% 
  pivot_longer(
    cols = price_excl_tax:price_incl_tax,
    names_to = "type",
    values_to = "price"
  ) %>% 
  select(-c(year))
# data viz
ggplot(c, aes(x = reorder(country,-price), y = price, fill = type)) + 
  geom_bar(stat = "identity") +
  scale_fill_brewer() +
  guides(fill = guide_legend(title = "Title")) +
  labs(title = "Pump Price of Unleaded Petrol as in 2022",
       x = " ",
       y = "Pence (p/litre)") +
  theme_classic() +
  coord_flip()

# manually fill
ggplot(c, aes(x = reorder(country,-price), y = price, fill = type)) + 
  geom_bar(stat = "identity") +
  guides(fill = guide_legend(title = "Title")) +
  labs(title = "Pump Price of Unleaded Petrol as in 2022",
       x = " ",
       y = "Pence (p/litre)") +
  scale_fill_manual(values = c(alpha("#33A02C",0.4), alpha("#33A02C", 1))) +  # ("#1F78B4", "#1F78B4")
  theme_classic() +
  coord_flip()

Sankey chart for ranking changes

Data preparation

data <- mydata %>% janitor::clean_names()   # clean cols name
data <- data[complete.cases(data), ]

data$month <- NULL
data$day_in_month_of_price_snapshot <- NULL
data$uk_rank_in_eu14_plus_uk <- NULL
data$uk_rank_in_eu27_plus_uk <- NULL

data$year <- as.factor(data$year)

data <- data %>%
  group_by(year) %>%
  summarise(across(2:28, ~ mean(.x, na.rm = TRUE)))
# Ranking the data 
# newdata <- data.frame(data, t(apply(-data[-1], 1, rank, ties.method='min', na.last='keep')))
dt_rank <- cbind(data[1], t(apply(-data[-1], 1, rank, ties.method='min', na.last='keep')))  
colnames(dt_rank)[15] <- 'UK' # rename 

# long to wide 
dt <- dt_rank %>% 
  pivot_longer(
    cols = belgium:slovenia,
    names_to = "country",
    values_to = "rank"
  )

# changing variable class (required for sankey chart)
dt$year <- as.character(dt$year)
dt$country <- as.character(dt$country)
dt$rank <- as.factor(dt$rank)
str(dt)
## tibble [297 × 3] (S3: tbl_df/tbl/data.frame)
##  $ year   : chr [1:297] "2013" "2013" "2013" "2013" ...
##  $ country: chr [1:297] "belgium" "denmark" "finland" "france" ...
##  $ rank   : Factor w/ 27 levels "1","2","3","4",..: 9 1 8 24 11 13 15 6 12 17 ...
# visualisation
ggplot(dt, aes(x = year, stratum = rank, alluvium = country, fill = country, label = country)) +
  geom_flow(stat = "alluvium", lode.guidance = "rightleft", color = "darkgray") +
  geom_stratum() +
  theme(legend.position = "bottom") +
  ggtitle("Fuel price rank changes") +
  scale_fill_viridis_d(direction = -1) +
  theme_classic()

For better visualisation, some nations are selected:

datas <- data %>% select(c(1:16)) # Only 15 nations are selected 

dt_rank <- cbind(datas[1], t(apply(-datas[-1], 1, rank, ties.method='min', na.last='keep')))  
colnames(dt_rank)[15] <- 'UK' # rename 

#
dt2 <- dt_rank %>% 
  pivot_longer(
    cols = belgium:bulgaria,
    names_to = "country",
    values_to = "rank"
  )

# changing variable class (required for sankey chart)
dt2$year <- as.character(dt2$year)
dt2$country <- as.character(dt2$country)
dt2$rank <- as.factor(dt2$rank)
str(dt2)
## tibble [165 × 3] (S3: tbl_df/tbl/data.frame)
##  $ year   : chr [1:165] "2013" "2013" "2013" "2013" ...
##  $ country: chr [1:165] "belgium" "denmark" "finland" "france" ...
##  $ rank   : Factor w/ 15 levels "1","2","3","4",..: 6 1 5 14 8 10 11 4 9 13 ...
ggplot(dt2, aes(x = year, stratum = rank, alluvium = country, fill = country, label = country)) +
  geom_flow(stat = "alluvium", lode.guidance = "rightleft", color = "darkgray") +
  geom_stratum(alpha = .8) +   # color gradient
  theme(legend.position = "bottom") +
  ggtitle("Fuel price rank changes") +
  scale_fill_viridis_d(direction = -1) +
  theme_classic() +
  geom_text(stat = "stratum", aes(label = rank)) 

Appendix

data <- mydata %>% janitor::clean_names()   # clean cols name
data$month <- as.Date(as.numeric(data$month), origin = "1899-12-30")
data$Month <- format(as.Date(data$month, format="%Y/%m/%d"),"%m")

# convert month in number into month abbreviation
data$Month <- as.numeric(data$Month)
data <- transform(data, MonthAbb = month.abb[Month])