This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

plot(cars)

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

# === Step 1: Install and load required packages ===
if (!require("readr")) install.packages("readr")
if (!require("dplyr")) install.packages("dplyr")
if (!require("ggplot2")) install.packages("ggplot2")

library(readr)
library(dplyr)
library(ggplot2)

# === Step 2: Load the dataset ===
# Make sure this CSV is in your working directory (use setwd() or upload in Posit Cloud)
df <- read_csv("/Users/erickwak/Desktop/rsconnect/synthetic_beverage_sales_data_100mb.csv")

# Check structure and first few rows
str(df)
spc_tbl_ [1,000,000 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ Order_ID     : chr [1:1000000] "ORD1" "ORD1" "ORD1" "ORD1" ...
 $ Customer_ID  : chr [1:1000000] "CUS1496" "CUS1496" "CUS1496" "CUS1496" ...
 $ Customer_Type: chr [1:1000000] "B2B" "B2B" "B2B" "B2B" ...
 $ Product      : chr [1:1000000] "Vio Wasser" "Evian" "Sprite" "Rauch Multivitamin" ...
 $ Category     : chr [1:1000000] "Water" "Water" "Soft Drinks" "Juices" ...
 $ Unit_Price   : num [1:1000000] 1.66 1.56 1.17 3.22 0.87 9.09 2.14 0.43 1.21 1.38 ...
 $ Quantity     : num [1:1000000] 53 90 73 59 35 2 44 13 92 3 ...
 $ Discount     : num [1:1000000] 0.1 0.1 0.05 0.1 0.1 0 0.1 0.05 0.1 0.05 ...
 $ Total_Price  : num [1:1000000] 79.2 126.4 81.1 171 27.4 ...
 $ Region       : chr [1:1000000] "Baden-Württemberg" "Baden-Württemberg" "Baden-Württemberg" "Baden-Württemberg" ...
 $ Order_Date   : Date[1:1000000], format: "2023-08-23" "2023-08-23" ...
 - attr(*, "spec")=
  .. cols(
  ..   Order_ID = col_character(),
  ..   Customer_ID = col_character(),
  ..   Customer_Type = col_character(),
  ..   Product = col_character(),
  ..   Category = col_character(),
  ..   Unit_Price = col_double(),
  ..   Quantity = col_double(),
  ..   Discount = col_double(),
  ..   Total_Price = col_double(),
  ..   Region = col_character(),
  ..   Order_Date = col_date(format = "")
  .. )
 - attr(*, "problems")=<externalptr> 
head(df)

# === Step 3: Compute total revenue per region ===
region_revenue <- df %>%
  group_by(Region) %>%
  summarise(Total_Revenue = sum(Total_Price, na.rm = TRUE)) %>%
  arrange(desc(Total_Revenue))

# View top 5 regions
top_regions <- head(region_revenue$Region, 5)
print("Top 5 regions by revenue:")
[1] "Top 5 regions by revenue:"
print(top_regions)
[1] "Hamburg"                "Hessen"                 "Mecklenburg-Vorpommern"
[4] "Saarland"               "Rheinland-Pfalz"       
# === Step 4: Filter data to include only the top 5 regions ===
top_region_data <- df %>%
  filter(Region %in% top_regions)

# === Step 5: Compute revenue by Region and Category ===
region_category_revenue <- top_region_data %>%
  group_by(Region, Category) %>%
  summarise(Category_Revenue = sum(Total_Price, na.rm = TRUE)) %>%
  ungroup()

# View the result
print("Sample of region_category_revenue:")
[1] "Sample of region_category_revenue:"
print(head(region_category_revenue))

# === Step 6: Plot revenue by beverage category in top 5 regions ===
ggplot(region_category_revenue, aes(x = Category_Revenue, y = Region, fill = Category)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Revenue by Beverage Category in Top 5 Regions",
    x = "Total Revenue",
    y = "Region"
  ) +
  theme_minimal()

# Load required packages
library(dplyr)
library(readr)

# Load data (skip if already loaded)
# df <- read_csv("synthetic_beverage_sales_data_100mb.csv")

# Summarize total revenue by region
region_revenue_table <- df %>%
  group_by(Region) %>%
  summarise(Total_Revenue = sum(Total_Price, na.rm = TRUE)) %>%
  arrange(desc(Total_Revenue)) %>%
  slice(1:5) %>%
  mutate(Rank = row_number()) %>%
  select(Rank, Region, Total_Revenue)

# Print table
print(region_revenue_table)
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKVGhpcyBpcyBhbiBbUiBNYXJrZG93bl0oaHR0cDovL3JtYXJrZG93bi5yc3R1ZGlvLmNvbSkgTm90ZWJvb2suIFdoZW4geW91IGV4ZWN1dGUgY29kZSB3aXRoaW4gdGhlIG5vdGVib29rLCB0aGUgcmVzdWx0cyBhcHBlYXIgYmVuZWF0aCB0aGUgY29kZS4gCgpUcnkgZXhlY3V0aW5nIHRoaXMgY2h1bmsgYnkgY2xpY2tpbmcgdGhlICpSdW4qIGJ1dHRvbiB3aXRoaW4gdGhlIGNodW5rIG9yIGJ5IHBsYWNpbmcgeW91ciBjdXJzb3IgaW5zaWRlIGl0IGFuZCBwcmVzc2luZyAqQ21kK1NoaWZ0K0VudGVyKi4gCgpgYGB7cn0KcGxvdChjYXJzKQpgYGAKCkFkZCBhIG5ldyBjaHVuayBieSBjbGlja2luZyB0aGUgKkluc2VydCBDaHVuayogYnV0dG9uIG9uIHRoZSB0b29sYmFyIG9yIGJ5IHByZXNzaW5nICpDbWQrT3B0aW9uK0kqLgoKV2hlbiB5b3Ugc2F2ZSB0aGUgbm90ZWJvb2ssIGFuIEhUTUwgZmlsZSBjb250YWluaW5nIHRoZSBjb2RlIGFuZCBvdXRwdXQgd2lsbCBiZSBzYXZlZCBhbG9uZ3NpZGUgaXQgKGNsaWNrIHRoZSAqUHJldmlldyogYnV0dG9uIG9yIHByZXNzICpDbWQrU2hpZnQrSyogdG8gcHJldmlldyB0aGUgSFRNTCBmaWxlKS4gCgpUaGUgcHJldmlldyBzaG93cyB5b3UgYSByZW5kZXJlZCBIVE1MIGNvcHkgb2YgdGhlIGNvbnRlbnRzIG9mIHRoZSBlZGl0b3IuIENvbnNlcXVlbnRseSwgdW5saWtlICpLbml0KiwgKlByZXZpZXcqIGRvZXMgbm90IHJ1biBhbnkgUiBjb2RlIGNodW5rcy4gSW5zdGVhZCwgdGhlIG91dHB1dCBvZiB0aGUgY2h1bmsgd2hlbiBpdCB3YXMgbGFzdCBydW4gaW4gdGhlIGVkaXRvciBpcyBkaXNwbGF5ZWQuCgpgYGB7cn0KIyA9PT0gU3RlcCAxOiBJbnN0YWxsIGFuZCBsb2FkIHJlcXVpcmVkIHBhY2thZ2VzID09PQppZiAoIXJlcXVpcmUoInJlYWRyIikpIGluc3RhbGwucGFja2FnZXMoInJlYWRyIikKaWYgKCFyZXF1aXJlKCJkcGx5ciIpKSBpbnN0YWxsLnBhY2thZ2VzKCJkcGx5ciIpCmlmICghcmVxdWlyZSgiZ2dwbG90MiIpKSBpbnN0YWxsLnBhY2thZ2VzKCJnZ3Bsb3QyIikKCmxpYnJhcnkocmVhZHIpCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkoZ2dwbG90MikKCiMgPT09IFN0ZXAgMjogTG9hZCB0aGUgZGF0YXNldCA9PT0KIyBNYWtlIHN1cmUgdGhpcyBDU1YgaXMgaW4geW91ciB3b3JraW5nIGRpcmVjdG9yeSAodXNlIHNldHdkKCkgb3IgdXBsb2FkIGluIFBvc2l0IENsb3VkKQpkZiA8LSByZWFkX2NzdigiL1VzZXJzL2VyaWNrd2FrL0Rlc2t0b3AvcnNjb25uZWN0L3N5bnRoZXRpY19iZXZlcmFnZV9zYWxlc19kYXRhXzEwMG1iLmNzdiIpCgojIENoZWNrIHN0cnVjdHVyZSBhbmQgZmlyc3QgZmV3IHJvd3MKc3RyKGRmKQpoZWFkKGRmKQoKIyA9PT0gU3RlcCAzOiBDb21wdXRlIHRvdGFsIHJldmVudWUgcGVyIHJlZ2lvbiA9PT0KcmVnaW9uX3JldmVudWUgPC0gZGYgJT4lCiAgZ3JvdXBfYnkoUmVnaW9uKSAlPiUKICBzdW1tYXJpc2UoVG90YWxfUmV2ZW51ZSA9IHN1bShUb3RhbF9QcmljZSwgbmEucm0gPSBUUlVFKSkgJT4lCiAgYXJyYW5nZShkZXNjKFRvdGFsX1JldmVudWUpKQoKIyBWaWV3IHRvcCA1IHJlZ2lvbnMKdG9wX3JlZ2lvbnMgPC0gaGVhZChyZWdpb25fcmV2ZW51ZSRSZWdpb24sIDUpCnByaW50KCJUb3AgNSByZWdpb25zIGJ5IHJldmVudWU6IikKcHJpbnQodG9wX3JlZ2lvbnMpCgojID09PSBTdGVwIDQ6IEZpbHRlciBkYXRhIHRvIGluY2x1ZGUgb25seSB0aGUgdG9wIDUgcmVnaW9ucyA9PT0KdG9wX3JlZ2lvbl9kYXRhIDwtIGRmICU+JQogIGZpbHRlcihSZWdpb24gJWluJSB0b3BfcmVnaW9ucykKCiMgPT09IFN0ZXAgNTogQ29tcHV0ZSByZXZlbnVlIGJ5IFJlZ2lvbiBhbmQgQ2F0ZWdvcnkgPT09CnJlZ2lvbl9jYXRlZ29yeV9yZXZlbnVlIDwtIHRvcF9yZWdpb25fZGF0YSAlPiUKICBncm91cF9ieShSZWdpb24sIENhdGVnb3J5KSAlPiUKICBzdW1tYXJpc2UoQ2F0ZWdvcnlfUmV2ZW51ZSA9IHN1bShUb3RhbF9QcmljZSwgbmEucm0gPSBUUlVFKSkgJT4lCiAgdW5ncm91cCgpCgojIFZpZXcgdGhlIHJlc3VsdApwcmludCgiU2FtcGxlIG9mIHJlZ2lvbl9jYXRlZ29yeV9yZXZlbnVlOiIpCnByaW50KGhlYWQocmVnaW9uX2NhdGVnb3J5X3JldmVudWUpKQoKIyA9PT0gU3RlcCA2OiBQbG90IHJldmVudWUgYnkgYmV2ZXJhZ2UgY2F0ZWdvcnkgaW4gdG9wIDUgcmVnaW9ucyA9PT0KZ2dwbG90KHJlZ2lvbl9jYXRlZ29yeV9yZXZlbnVlLCBhZXMoeCA9IENhdGVnb3J5X1JldmVudWUsIHkgPSBSZWdpb24sIGZpbGwgPSBDYXRlZ29yeSkpICsKICBnZW9tX2JhcihzdGF0ID0gImlkZW50aXR5IikgKwogIGxhYnMoCiAgICB0aXRsZSA9ICJSZXZlbnVlIGJ5IEJldmVyYWdlIENhdGVnb3J5IGluIFRvcCA1IFJlZ2lvbnMiLAogICAgeCA9ICJUb3RhbCBSZXZlbnVlIiwKICAgIHkgPSAiUmVnaW9uIgogICkgKwogIHRoZW1lX21pbmltYWwoKQpgYGAKYGBge3J9CiMgTG9hZCByZXF1aXJlZCBwYWNrYWdlcwpsaWJyYXJ5KGRwbHlyKQpsaWJyYXJ5KHJlYWRyKQoKIyBMb2FkIGRhdGEgKHNraXAgaWYgYWxyZWFkeSBsb2FkZWQpCiMgZGYgPC0gcmVhZF9jc3YoInN5bnRoZXRpY19iZXZlcmFnZV9zYWxlc19kYXRhXzEwMG1iLmNzdiIpCgojIFN1bW1hcml6ZSB0b3RhbCByZXZlbnVlIGJ5IHJlZ2lvbgpyZWdpb25fcmV2ZW51ZV90YWJsZSA8LSBkZiAlPiUKICBncm91cF9ieShSZWdpb24pICU+JQogIHN1bW1hcmlzZShUb3RhbF9SZXZlbnVlID0gc3VtKFRvdGFsX1ByaWNlLCBuYS5ybSA9IFRSVUUpKSAlPiUKICBhcnJhbmdlKGRlc2MoVG90YWxfUmV2ZW51ZSkpICU+JQogIHNsaWNlKDE6NSkgJT4lCiAgbXV0YXRlKFJhbmsgPSByb3dfbnVtYmVyKCkpICU+JQogIHNlbGVjdChSYW5rLCBSZWdpb24sIFRvdGFsX1JldmVudWUpCgojIFByaW50IHRhYmxlCnByaW50KHJlZ2lvbl9yZXZlbnVlX3RhYmxlKQpgYGAKCg==