#Source: https://t.me/rstudioprogr
library(ggplot2)
library(remotes)
library(dplyr)
library(ggsankey)

# Build one label vector per P&L stage. Each label is repeated once per unit
# of the corresponding amount, so row i of the final table traces one unit of
# revenue through the statement.
t1 <- c(rep("Bundled Subscriptions", 5069), rep("CV Database Access", 3288),
        rep("Job Postings", 7858), rep("Other", 1868))
t2 <- rep("Revenue", 18083)
t3 <- c(rep("Gross profit", 14942), rep("COGS", 3141))

# Later stages list only the units that continue from the first label of the
# previous stage; every other unit has left the flow and is padded with NA.
t4 <- c(rep("Operating profit", 6921), rep("Operating expenses", 8021))
t5 <- c(rep("Income before taxes", 5235), rep("Non-operating expenses", 1636))
t6 <- c(rep("Net Income", 3691), rep("Taxes", 1544))

# NOTE(review): Operating profit (6921) is 50 more than Income before taxes +
# Non-operating expenses (5235 + 1636 = 6871). Those 50 units end at
# "Operating profit" here; confirm the source figures.

# The first three stages must cover every unit exactly; the later ones may be
# shorter but never longer (`length<-` would silently truncate them).
n_units <- length(t1)
stopifnot(
  length(t2) == n_units,
  length(t3) == n_units,
  length(t4) <= n_units,
  length(t5) <= n_units,
  length(t6) <= n_units
)

# Pad explicitly with NA. Leaving the vectors short would make cbind() recycle
# them from the start, labelling trailing COGS rows as "Income before taxes" /
# "Net Income" and inflating those nodes in the chart.
length(t4) <- n_units
length(t5) <- n_units
length(t6) <- n_units

# Assemble the wide table, one column per stage. data.frame() stops with an
# error on mismatched column lengths instead of recycling like cbind().
d <- data.frame(
  Segments = t1,
  Revenue = t2,
  `Gross Profit` = t3,
  `Operating Profit` = t4,
  `Income before taxes` = t5,
  `Net Income` = t6,
  check.names = FALSE
)
tail(d)
##       Segments Revenue Gross Profit Operating Profit Income before taxes
## 18078    Other Revenue         COGS             <NA>                <NA>
## 18079    Other Revenue         COGS             <NA>                <NA>
## 18080    Other Revenue         COGS             <NA>                <NA>
## 18081    Other Revenue         COGS             <NA>                <NA>
## 18082    Other Revenue         COGS             <NA>                <NA>
## 18083    Other Revenue         COGS             <NA>                <NA>
##       Net Income
## 18078       <NA>
## 18079       <NA>
## 18080       <NA>
## 18081       <NA>
## 18082       <NA>
## 18083       <NA>
# Total number of rows in the wide table; used as the denominator for the
# percentage shown on each node label.
TotalCount <- nrow(d)

# Reshape the wide table into the long layout that ggsankey expects
# (columns x, node, next_x, next_node). Each stage column is listed once.
df <- d %>%
  make_long(Segments, Revenue, `Gross Profit`, `Operating Profit`,
            `Income before taxes`, `Net Income`)

# Count the rows behind every node label and express the count as a share of
# TotalCount. Missing nodes form their own NA group, pooled across stages.
dagg <- df %>%
  dplyr::count(node) %>%
  dplyr::mutate(pct = n / TotalCount)

# Attach the per-node count and share to every long-format row.
df2 <- merge(df, dagg, by = "node", all.x = TRUE)

# Inspect the end of the merged table; the rows whose node is missing end up
# at the bottom and are filtered out before plotting.
tail(df2)
# Draw the Sankey diagram of the P&L statement.
# Rows with a missing node are removed with is.na() rather than by comparing
# against the string "NA". Each node is labelled "<node>=<n>(<pct>%)".
pl <- df2 %>%
  filter(!is.na(node)) %>%
  ggplot(aes(x = x, next_x = next_x, node = node, next_node = next_node,
             fill = factor(node),
             label = paste0(node, "=", n, "(", round(pct * 100, 1), "%)"))) +
  # The legend is hidden by the theme below, so it is not requested here.
  geom_sankey(flow.alpha = 0.5, color = "gray40", show.legend = FALSE) +
  geom_sankey_label(size = 4, color = "black", fill = "white", vjust = -0.5) +
  theme_bw() +
  # Strip the legend, axis titles, y-axis text, ticks and grid lines so only
  # the stage names on the x axis remain.
  theme(legend.position = "none",
        axis.title = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank()) +
  scale_fill_viridis_d(option = "magma") +
  labs(title = "P&L Statement using sankey graph",
       caption = "@rstudioprogr")

# Print the finished plot.
pl