library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
data <- read.csv("https://raw.githubusercontent.com/farhodibr/CUNY-SPS-MSDS/refs/heads/main/DATA607/PROJECT2/Cola.csv")
head(data)
## Data.provided.by.SimFin X X.1 X.2 X.3 X.4
## 1
## 2 Profit & Loss statement
## 3 in million USD FY '09 FY '10 FY '11 FY '12
## 4 NET OPERATING REVENUES 30,990 35,119 46,542 48,017
## 5 Cost of goods sold 11,088 12,693 18,215 19,053
## 6 Gross Profit 19,902 22,426 28,327 28,964
## X.5 X.6 X.7 X.8 X.9 X.10
## 1
## 2
## 3 FY '13 FY '14 FY '15 FY '16 FY '17 FY '18
## 4 46,854 45,998 44,294 41,863 35,410 31,856
## 5 18,421 17,889 17,482 16,465 13,255 11,770
## 6 28,433 28,109 26,812 25,398 22,155 20,086
Why is this dataset untidy:
Multiple variables in separate rows: The dataset contains various variables that are scattered across rows.
Fiscal years as columns
This dataset is big and could use a lot of tidying and broken down into separate tidy data tables. I will just use Profit & Loss statement to create tidy data table as example.
profit_data <- data |>
slice(4:6) |>
select(!Data.provided.by.SimFin)
print(profit_data)
## X X.1 X.2 X.3 X.4 X.5 X.6 X.7
## 1 NET OPERATING REVENUES 30,990 35,119 46,542 48,017 46,854 45,998 44,294
## 2 Cost of goods sold 11,088 12,693 18,215 19,053 18,421 17,889 17,482
## 3 Gross Profit 19,902 22,426 28,327 28,964 28,433 28,109 26,812
## X.8 X.9 X.10
## 1 41,863 35,410 31,856
## 2 16,465 13,255 11,770
## 3 25,398 22,155 20,086
Here I am cleaning and formating rows and column names:
colnames(profit_data) <- tolower(gsub(" ", "_", colnames(profit_data)))
profit_data$x <- tolower(gsub(" ", "_", profit_data$x))
profit_data <- profit_data |>
mutate(across(where(is.character), ~ tolower(.)))
profit_data <- profit_data |>
rename( profit_loss = x)
print(profit_data)
## profit_loss x.1 x.2 x.3 x.4 x.5 x.6 x.7
## 1 net_operating_revenues 30,990 35,119 46,542 48,017 46,854 45,998 44,294
## 2 cost_of_goods_sold 11,088 12,693 18,215 19,053 18,421 17,889 17,482
## 3 gross_profit 19,902 22,426 28,327 28,964 28,433 28,109 26,812
## x.8 x.9 x.10
## 1 41,863 35,410 31,856
## 2 16,465 13,255 11,770
## 3 25,398 22,155 20,086
This code first create a long table, and after I am making it wide to make it tidy:
profit_data_tidy <- profit_data |>
pivot_longer(
col = starts_with("x"),
names_to = "period",
values_to = "amount"
) |>
group_by(profit_loss, period)
head(profit_data_tidy)
## # A tibble: 6 × 3
## # Groups: profit_loss, period [6]
## profit_loss period amount
## <chr> <chr> <chr>
## 1 net_operating_revenues x.1 30,990
## 2 net_operating_revenues x.2 35,119
## 3 net_operating_revenues x.3 46,542
## 4 net_operating_revenues x.4 48,017
## 5 net_operating_revenues x.5 46,854
## 6 net_operating_revenues x.6 45,998
profit_data_tidy$amount <- as.numeric(gsub(",", "", profit_data_tidy$amount))
profit_data_tidy$period <- as.numeric(gsub("x.", "", profit_data_tidy$period)) + 2000
profit_data_tidy <- profit_data_tidy |>
pivot_wider(
names_from = profit_loss,
values_from = amount
)
head(profit_data_tidy)
## # A tibble: 6 × 4
## # Groups: period [6]
## period net_operating_revenues cost_of_goods_sold gross_profit
## <dbl> <dbl> <dbl> <dbl>
## 1 2001 30990 11088 19902
## 2 2002 35119 12693 22426
## 3 2003 46542 18215 28327
## 4 2004 48017 19053 28964
## 5 2005 46854 18421 28433
## 6 2006 45998 17889 28109
Why is this dataset is tidy?
Each row represents a single observation which corresponds to a specific period (year 2001, 2002, etc.), making it easy to analyze trends or patterns over time.
The dataset has separate columns for period,
net_operating_revenues, cost_of_goods_sold,
and gross_profit, which allows each variable to be analyzed
independently.
Here is the plot showing financial performance over time:
#profit_data_tidy$period <- as.factor(profit_data_tidy$period)
profit_data_tidy$net_operating_revenues <- as.numeric(profit_data_tidy$net_operating_revenues)
profit_data_tidy$cost_of_goods_sold <- as.numeric(profit_data_tidy$cost_of_goods_sold)
profit_data_tidy$gross_profit <- as.numeric(profit_data_tidy$gross_profit)
ggplot(profit_data_tidy, aes(x = period)) +
geom_line(aes(y = net_operating_revenues,
color = "Net Operating Revenues", group = 1),
linewidth = 1) +
geom_line(aes(y = cost_of_goods_sold,
color = "Cost of Goods Sold", group = 1),
linewidth = 1) +
geom_line(aes(y = gross_profit,
color = "Gross Profit", group = 1),
linewidth = 1) +
labs(title = "Trend Analysis: Financial Performance Over Time",
x = "Period",
y = "Amount (in USD)",
color = "Legend") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
From this plot I can say:
This plot analyze profit margin over the years
profit_data_tidy <- profit_data_tidy |>
mutate(gross_profit_margin = (gross_profit / net_operating_revenues) * 100)
profit_data_tidy$period <- as.numeric(as.character(profit_data_tidy$period))
print(profit_data_tidy)
## # A tibble: 10 × 5
## # Groups: period [10]
## period net_operating_revenues cost_of_goods_sold gross_profit
## <dbl> <dbl> <dbl> <dbl>
## 1 2001 30990 11088 19902
## 2 2002 35119 12693 22426
## 3 2003 46542 18215 28327
## 4 2004 48017 19053 28964
## 5 2005 46854 18421 28433
## 6 2006 45998 17889 28109
## 7 2007 44294 17482 26812
## 8 2008 41863 16465 25398
## 9 2009 35410 13255 22155
## 10 2010 31856 11770 20086
## # ℹ 1 more variable: gross_profit_margin <dbl>
ggplot(profit_data_tidy, aes(x = period, y = gross_profit_margin)) +
geom_point(color = "red") +
geom_line(color = "blue", linewidth = 1) +
labs(title = "Gross Profit Margin Over Time",
x = "Period", y = "Gross Profit Margin (%)") +
scale_x_continuous(breaks = seq(min(profit_data_tidy$period), max(profit_data_tidy$period), by = 1),
labels = as.character(seq(min(profit_data_tidy$period), max(profit_data_tidy$period), by = 1))) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Profit margin declined significantly in years of 2002-2004, which is interesting. Because from the previous plot we can see that revenues peak is in 2004. As we can see from 2007 profit margin steadily increasing.