Tidying “Cola” dataset

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
data <- read.csv("https://raw.githubusercontent.com/farhodibr/CUNY-SPS-MSDS/refs/heads/main/DATA607/PROJECT2/Cola.csv")
head(data)
##   Data.provided.by.SimFin                      X    X.1    X.2    X.3    X.4
## 1                                                                           
## 2 Profit & Loss statement                                                   
## 3                                 in million USD FY '09 FY '10 FY '11 FY '12
## 4                         NET OPERATING REVENUES 30,990 35,119 46,542 48,017
## 5                             Cost of goods sold 11,088 12,693 18,215 19,053
## 6                                   Gross Profit 19,902 22,426 28,327 28,964
##      X.5    X.6    X.7    X.8    X.9   X.10
## 1                                          
## 2                                          
## 3 FY '13 FY '14 FY '15 FY '16 FY '17 FY '18
## 4 46,854 45,998 44,294 41,863 35,410 31,856
## 5 18,421 17,889 17,482 16,465 13,255 11,770
## 6 28,433 28,109 26,812 25,398 22,155 20,086

Why is this dataset untidy:

This dataset is big and could use a lot of tidying and broken down into separate tidy data tables. I will just use Profit & Loss statement to create tidy data table as example.

profit_data <- data |>
  slice(4:6) |>
  select(!Data.provided.by.SimFin)
print(profit_data)
##                        X    X.1    X.2    X.3    X.4    X.5    X.6    X.7
## 1 NET OPERATING REVENUES 30,990 35,119 46,542 48,017 46,854 45,998 44,294
## 2     Cost of goods sold 11,088 12,693 18,215 19,053 18,421 17,889 17,482
## 3           Gross Profit 19,902 22,426 28,327 28,964 28,433 28,109 26,812
##      X.8    X.9   X.10
## 1 41,863 35,410 31,856
## 2 16,465 13,255 11,770
## 3 25,398 22,155 20,086

Here I am cleaning and formating rows and column names:

colnames(profit_data) <- tolower(gsub(" ", "_", colnames(profit_data)))
profit_data$x <- tolower(gsub(" ", "_", profit_data$x))

profit_data <- profit_data |>
  mutate(across(where(is.character), ~ tolower(.)))

profit_data <- profit_data |>
  rename( profit_loss = x)
         
print(profit_data)
##              profit_loss    x.1    x.2    x.3    x.4    x.5    x.6    x.7
## 1 net_operating_revenues 30,990 35,119 46,542 48,017 46,854 45,998 44,294
## 2     cost_of_goods_sold 11,088 12,693 18,215 19,053 18,421 17,889 17,482
## 3           gross_profit 19,902 22,426 28,327 28,964 28,433 28,109 26,812
##      x.8    x.9   x.10
## 1 41,863 35,410 31,856
## 2 16,465 13,255 11,770
## 3 25,398 22,155 20,086

This code first create a long table, and after I am making it wide to make it tidy:

profit_data_tidy <- profit_data |>
  pivot_longer(
    col = starts_with("x"),
    names_to = "period",
    values_to = "amount"
  ) |>
  group_by(profit_loss, period)
head(profit_data_tidy)
## # A tibble: 6 × 3
## # Groups:   profit_loss, period [6]
##   profit_loss            period amount
##   <chr>                  <chr>  <chr> 
## 1 net_operating_revenues x.1    30,990
## 2 net_operating_revenues x.2    35,119
## 3 net_operating_revenues x.3    46,542
## 4 net_operating_revenues x.4    48,017
## 5 net_operating_revenues x.5    46,854
## 6 net_operating_revenues x.6    45,998
profit_data_tidy$amount <- as.numeric(gsub(",", "", profit_data_tidy$amount))
profit_data_tidy$period <- as.numeric(gsub("x.", "", profit_data_tidy$period)) + 2000

profit_data_tidy <- profit_data_tidy |>
  pivot_wider(
    names_from = profit_loss,
    values_from = amount
  )
head(profit_data_tidy)
## # A tibble: 6 × 4
## # Groups:   period [6]
##   period net_operating_revenues cost_of_goods_sold gross_profit
##    <dbl>                  <dbl>              <dbl>        <dbl>
## 1   2001                  30990              11088        19902
## 2   2002                  35119              12693        22426
## 3   2003                  46542              18215        28327
## 4   2004                  48017              19053        28964
## 5   2005                  46854              18421        28433
## 6   2006                  45998              17889        28109

Why is this dataset is tidy?

Analysis

Here is the plot showing financial performance over time:

#profit_data_tidy$period <- as.factor(profit_data_tidy$period)

profit_data_tidy$net_operating_revenues <- as.numeric(profit_data_tidy$net_operating_revenues)
profit_data_tidy$cost_of_goods_sold <- as.numeric(profit_data_tidy$cost_of_goods_sold)
profit_data_tidy$gross_profit <- as.numeric(profit_data_tidy$gross_profit)

ggplot(profit_data_tidy, aes(x = period)) + 
  geom_line(aes(y = net_operating_revenues, 
                color = "Net Operating Revenues", group = 1), 
                linewidth = 1) + 
  geom_line(aes(y = cost_of_goods_sold, 
                color = "Cost of Goods Sold", group = 1), 
                linewidth = 1) +
  geom_line(aes(y = gross_profit, 
                color = "Gross Profit", group = 1), 
                linewidth = 1) +
  labs(title = "Trend Analysis: Financial Performance Over Time",
       x = "Period", 
       y = "Amount (in USD)",
       color = "Legend") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

From this plot I can say:

This plot analyze profit margin over the years

profit_data_tidy <- profit_data_tidy |>
  mutate(gross_profit_margin = (gross_profit / net_operating_revenues) * 100)

profit_data_tidy$period <- as.numeric(as.character(profit_data_tidy$period))

print(profit_data_tidy)
## # A tibble: 10 × 5
## # Groups:   period [10]
##    period net_operating_revenues cost_of_goods_sold gross_profit
##     <dbl>                  <dbl>              <dbl>        <dbl>
##  1   2001                  30990              11088        19902
##  2   2002                  35119              12693        22426
##  3   2003                  46542              18215        28327
##  4   2004                  48017              19053        28964
##  5   2005                  46854              18421        28433
##  6   2006                  45998              17889        28109
##  7   2007                  44294              17482        26812
##  8   2008                  41863              16465        25398
##  9   2009                  35410              13255        22155
## 10   2010                  31856              11770        20086
## # ℹ 1 more variable: gross_profit_margin <dbl>
ggplot(profit_data_tidy, aes(x = period, y = gross_profit_margin)) +
  geom_point(color = "red") + 
  geom_line(color = "blue", linewidth = 1) +
  labs(title = "Gross Profit Margin Over Time",
       x = "Period", y = "Gross Profit Margin (%)") +
  scale_x_continuous(breaks = seq(min(profit_data_tidy$period), max(profit_data_tidy$period), by = 1),
                     labels = as.character(seq(min(profit_data_tidy$period), max(profit_data_tidy$period), by = 1))) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Profit margin declined significantly in years of 2002-2004, which is interesting. Because from the previous plot we can see that revenues peak is in 2004. As we can see from 2007 profit margin steadily increasing.