Midterm Exam

Data Science Programming 1

1 Section A

1.1 Logical Approach and Methodology

This section focuses on automating the collection and integration of data from multiple sources. To meet the assignment criteria, five different file formats—CSV, Excel, JSON, TXT, and XML—are processed.

The workflow is organized into three main stages:

Data Acquisition: Using appropriate R libraries to read each file format, ensuring the data is ready for further processing.
Structural Validation: A looping mechanism combined with if-else logic is used to inspect the rows and columns of each file. This step verifies whether the column structure matches the reference standard before any merging occurs.
Data Integration: All validated datasets are consolidated into a single master dataset named df_utama. This stage includes standardizing data types and removing duplicate records to ensure the final output is clean and consistent.

# ============================================================
# SECTION A – DATA COLLECTION USING PROGRAMMING
# ============================================================

# Loading essential libraries for data manipulation and reporting
library(readr)
library(readxl)
library(jsonlite)
library(XML)
library(dplyr)
library(knitr)
library(DT)

# --- STEP 1: Data Acquisition (Reading multiple file formats) ---
df_csv  <- read_csv("ecommerce.csv", show_col_types = FALSE)
df_xlsx <- read_excel("ecommerce.xlsx", sheet = "ecommerce_raw")
df_json <- fromJSON("ecommerce.json")
if (!is.data.frame(df_json)) { df_json <- as.data.frame(df_json) }
df_txt  <- read_delim("ecommerce.txt", delim = "|", show_col_types = FALSE)
xml_data <- xmlParse("ecommerce.xml")
df_xml   <- xmlToDataFrame(nodes = getNodeSet(xml_data, "//Record"), stringsAsFactors = FALSE)

# ============================================================
# STEP 2-4: Automation, Validation, and Integration
# ============================================================

file_list <- list(CSV = df_csv, XLSX = df_xlsx, JSON = df_json, TXT = df_txt, XML = df_xml)

# --- TASK REQUIREMENT: Display rows, columns, and column names per file ---
cat("=== FILE-BY-FILE SUMMARY ===\n\n")

## === FILE-BY-FILE SUMMARY ===

for (file_name in names(file_list)) {
  df_temp <- file_list[[file_name]]
  cat(sprintf("File     : %s\n", file_name))
  cat(sprintf("Rows     : %d\n", nrow(df_temp)))
  cat(sprintf("Columns  : %d\n", ncol(df_temp)))
  cat(sprintf("Col Names: %s\n\n", paste(names(df_temp), collapse = ", ")))
}

## File     : CSV
## Rows     : 2000
## Columns  : 22
## Col Names: order_id, order_date, ship_date, platform, category, product_name, unit_price, quantity, gross_sales, campaign, voucher_code, discount_pct, discount_value, shipping_cost, net_sales, payment_method, customer_segment, region, stock_status, order_status, customer_rating, priority_flag
## 
## File     : XLSX
## Rows     : 2000
## Columns  : 22
## Col Names: order_id, order_date, ship_date, platform, category, product_name, unit_price, quantity, gross_sales, campaign, voucher_code, discount_pct, discount_value, shipping_cost, net_sales, payment_method, customer_segment, region, stock_status, order_status, customer_rating, priority_flag
## 
## File     : JSON
## Rows     : 2000
## Columns  : 22
## Col Names: order_id, order_date, ship_date, platform, category, product_name, unit_price, quantity, gross_sales, campaign, voucher_code, discount_pct, discount_value, shipping_cost, net_sales, payment_method, customer_segment, region, stock_status, order_status, customer_rating, priority_flag
## 
## File     : TXT
## Rows     : 2000
## Columns  : 22
## Col Names: order_id, order_date, ship_date, platform, category, product_name, unit_price, quantity, gross_sales, campaign, voucher_code, discount_pct, discount_value, shipping_cost, net_sales, payment_method, customer_segment, region, stock_status, order_status, customer_rating, priority_flag
## 
## File     : XML
## Rows     : 2000
## Columns  : 22
## Col Names: order_id, order_date, ship_date, platform, category, product_name, unit_price, quantity, gross_sales, campaign, voucher_code, discount_pct, discount_value, shipping_cost, net_sales, payment_method, customer_segment, region, stock_status, order_status, customer_rating, priority_flag

# --- DATA SUMMARY (Interactive Table) ---
ringkasan_df <- data.frame(
  File         = names(file_list),
  Rows         = sapply(file_list, nrow),
  Columns      = sapply(file_list, ncol),
  Column_Names = sapply(file_list, function(df) paste(names(df), collapse = ", ")),
  row.names    = NULL
)
datatable(ringkasan_df,
          caption  = "Data Summary per File (Rows, Columns & Column Names)",
          options  = list(pageLength = 5, scrollX = TRUE, dom = 'ftp'),
          rownames = FALSE)

# --- COLUMN VALIDATION using FOR LOOP + IF/ELSE ---
reference_cols <- names(df_csv)
files_ready    <- list()
validasi_df    <- data.frame(File = character(), Rows = integer(),
                              Columns = integer(), Status = character(),
                              stringsAsFactors = FALSE)

for (file_name in names(file_list)) {
  df_temp   <- file_list[[file_name]]
  ref_norm  <- tolower(trimws(reference_cols))
  curr_norm <- tolower(trimws(names(df_temp)))

  # IF/ELSE: check whether column structure matches the reference
  if (identical(sort(ref_norm), sort(curr_norm))) {
    status <- "Ready to merge ✓"
    files_ready[[file_name]] <- df_temp
  } else {
    status <- "Need adjustment ✗"
  }
  validasi_df <- rbind(validasi_df, data.frame(
    File    = file_name,
    Rows    = nrow(df_temp),
    Columns = ncol(df_temp),
    Status  = status,
    stringsAsFactors = FALSE
  ))
}

datatable(validasi_df,
          caption  = "Column Structure Validation per File",
          options  = list(pageLength = 5, dom = 'ftp'),
          rownames = FALSE)

# --- Data Merging into df_utama ---
if (length(files_ready) > 0) {
  standardize_cols <- function(df, ref_cols) {
    df <- df[, ref_cols[ref_cols %in% names(df)], drop = FALSE]
    df[] <- lapply(df, as.character)
    return(df)
  }

  files_std <- lapply(files_ready, standardize_cols, ref_cols = reference_cols)
  for (fname in names(files_std)) { files_std[[fname]]$source_file <- fname }

  # Final Consolidated Dataset
  df_utama <- bind_rows(files_std) %>%
    distinct(order_id, source_file, .keep_all = TRUE)

  cat("- Final row count in df_utama:", nrow(df_utama), "\n")
  cat("- Total columns              :", ncol(df_utama), "\n")
  cat("- Data is ready as 'df_utama'\n\n")

  # Interactive preview of merged dataset
  datatable(head(df_utama, 10),
            caption  = "Preview of df_utama — Top 10 Rows (All Columns)",
            options  = list(pageLength = 5, scrollX = TRUE),
            rownames = FALSE)

} else {
  cat("\n[ERROR] No datasets matched requirements. Merging cancelled.\n")
}

## - Final row count in df_utama: 9830 
## - Total columns              : 23 
## - Data is ready as 'df_utama'

2 Section B

2.1 Logical Approach and Data Handling Methodology

Following the data integration in Section A, an evaluation of df_utama is conducted to ensure analytical quality and accuracy. This process focuses on three key areas:

Dataset Overview: Validating final dimensions and inspecting data types for each column to ensure consistent variable structures.
Data Integrity Check: Detecting data irregularities, including identifying missing values (system NA and empty strings) and duplicate rows to prevent double-counting.
Quality Issue Identification: Critically analyzing variable format anomalies and technical risks arising from merging data from diverse source formats.

2.2 R Code Implementation and Analysis Output

library(DT)

# ============================================================
# SECTION B – DATA HANDLING
# ============================================================
# TASK 1: Display total rows, columns, and data types
cat("=== TASK 1: Dataset Overview ===\n")
cat("Total Rows    :", nrow(df_utama), "\n")
cat("Total Columns:", ncol(df_utama), "\n\n")

tipe_df <- data.frame(
  Column    = names(df_utama),
  Data_Type = sapply(df_utama, class),
  row.names = NULL
)
datatable(tipe_df,
          caption  = "Data Type per Column in df_utama",
          options  = list(pageLength = 10, scrollX = TRUE),
          rownames = FALSE)

# TASK 2: Identify Missing Values & Duplicate Rows
cat("\n=== TASK 2: Missing Values & Duplicate Rows ===\n")

missing_df <- data.frame(
  Column        = names(df_utama),
  Missing_Count = sapply(df_utama, function(x) sum(is.na(x) | trimws(as.character(x)) == "")),
  row.names     = NULL
)
missing_df$Percentage <- paste0(
  round(missing_df$Missing_Count / nrow(df_utama) * 100, 2), "%"
)
datatable(missing_df,
          caption  = "Missing Values per Column",
          options  = list(pageLength = 10, scrollX = TRUE),
          rownames = FALSE)

cat("Total Missing Values (all columns):", sum(missing_df$Missing_Count), "\n")

dup_count <- sum(duplicated(df_utama))
cat("Total Duplicate Rows:", dup_count, "\n")

if (dup_count > 0) {
  datatable(head(df_utama[duplicated(df_utama), ], 5),
            caption  = "Sample Duplicate Rows",
            options  = list(pageLength = 5, scrollX = TRUE),
            rownames = FALSE)
}

# TASK 3: At Least 3 Data Quality Issues
total_missing <- sum(missing_df$Missing_Count)
cat("\n=== TASK 3: Data Quality Issues Found ===\n\n")

cat("1. MISSING VALUES\n")
cat("   Found", total_missing, "empty/NA values across the dataset.\n")
cat("   Most affected column:",
    missing_df$Column[which.max(missing_df$Missing_Count)],
    "with", max(missing_df$Missing_Count), "missing values.\n\n")

cat("2. DUPLICATE ROWS\n")
cat("   Found", dup_count, "duplicate rows which may lead to\n")
cat("   double-counting in analysis and reporting.\n\n")

char_cols <- sum(sapply(df_utama, is.character))
cat("3. INCONSISTENT DATA TYPES\n")
cat("  ", char_cols, "out of", ncol(df_utama),
    "columns are 'character' type, including numerical columns such as\n")
cat("   'price', 'quantity', etc., which should be numeric/integer.\n\n")

source_summary <- table(df_utama$source_file)
cat("4. DATA ORIGINATED FROM VARIOUS FILE FORMATS\n")
cat("   Dataset merged from", length(source_summary), "different file sources.\n")
cat("   Different formats pose a risk of inconsistent encoding and separators.\n")

## === TASK 1: Dataset Overview ===
## Total Rows    : 9830 
## Total Columns: 23

## 
## === TASK 2: Missing Values & Duplicate Rows ===

## Total Missing Values (all columns): 4700 
## Total Duplicate Rows: 0 
## 
## === TASK 3: Data Quality Issues Found ===
## 
## 1. MISSING VALUES
##    Found 4700 empty/NA values across the dataset.
##    Most affected column: customer_rating with 2005 missing values.
## 
## 2. DUPLICATE ROWS
##    Found 0 duplicate rows which may lead to
##    double-counting in analysis and reporting.
## 
## 3. INCONSISTENT DATA TYPES
##    23 out of 23 columns are 'character' type, including numerical columns such as
##    'price', 'quantity', etc., which should be numeric/integer.
## 
## 4. DATA ORIGINATED FROM VARIOUS FILE FORMATS
##    Dataset merged from 5 different file sources.
##    Different formats pose a risk of inconsistent encoding and separators.

3 Section C

3.1 Data Cleaning Logical Framework

This stage focuses on executing the cleaning of df_utama using programmatic logic to ensure the dataset is analysis-ready. The process covers three key areas:

Price Normalization: Removing currency formats (Rp) and enforcing non-negative constraints to achieve mathematical consistency.
Missing Value Handling: Filling empty payment_method entries with “Unknown” and customer_rating with the median. The median is chosen for its robustness against extreme outliers.
Categorical Standardization Loop: Utilizing a single loop mechanism to perform whitespace trimming and unify text formatting across platform, order_status, and payment_method simultaneously.

3.2 Technical Implementation of Data Cleaning

# ============================================================
# SECTION C – DATA CLEANING
# ============================================================
cat("=== SECTION C: DATA CLEANING ===\n\n")

df_clean <- df_utama

# ============================================================
# TASK 2: Price / Sales Value Cleaning — ALL 4 NUMERIC COLUMNS
# Logic: Remove 'Rp' prefix, strip thousand-separator dots,
#        convert to numeric, force any value < 0 to 0.
# ============================================================
cat("\n--- Task 2: Currency Cleaning (4 Numeric Columns) ---\n")

# Reusable helper function
clean_currency_vec <- function(vec) {
  result <- vector("numeric", length(vec))
  for (i in seq_along(vec)) {
    val <- as.character(vec[i])
    if (is.na(val) || trimws(val) == "" || toupper(trimws(val)) == "NA") {
      result[i] <- NA_real_
      next
    }
    # Remove 'Rp' prefix and thousand-separator dots
    if (grepl("Rp", trimws(val), ignore.case = TRUE)) {
      val <- gsub("Rp\\s*", "", val, ignore.case = TRUE)
      val <- gsub("\\.", "", val)
      val <- gsub(",", ".", val)
    }
    num <- suppressWarnings(as.numeric(trimws(val)))
    # IF: enforce non-negative — any value < 0 becomes 0
    if (!is.na(num) && num < 0) {
      result[i] <- 0
    } else {
      result[i] <- ifelse(is.na(num), NA_real_, num)
    }
  }
  return(result)
}

# MANDATORY LOOP: iterate over all 4 financial columns
numeric_cols <- c("unit_price", "gross_sales", "net_sales", "discount_value")

for (col in numeric_cols) {
  df_clean[[col]] <- clean_currency_vec(df_clean[[col]])
  cat(sprintf("  ✔ %-20s → numeric | min: %s | max: %s\n",
              col,
              format(round(min(df_clean[[col]], na.rm = TRUE)), big.mark = ","),
              format(round(max(df_clean[[col]], na.rm = TRUE)), big.mark = ",")))
}

# Interactive summary table for all 4 cleaned numeric columns
num_summary <- data.frame(
  Column   = numeric_cols,
  Min      = sapply(numeric_cols, function(c) round(min(df_clean[[c]], na.rm = TRUE))),
  Q1       = sapply(numeric_cols, function(c) round(quantile(df_clean[[c]], 0.25, na.rm = TRUE))),
  Median   = sapply(numeric_cols, function(c) round(median(df_clean[[c]], na.rm = TRUE))),
  Mean     = sapply(numeric_cols, function(c) round(mean(df_clean[[c]], na.rm = TRUE))),
  Max      = sapply(numeric_cols, function(c) round(max(df_clean[[c]], na.rm = TRUE))),
  NA_Count = sapply(numeric_cols, function(c) sum(is.na(df_clean[[c]]))),
  row.names = NULL
)
datatable(num_summary,
          caption  = "All 4 Numeric Columns — Summary After Currency Cleaning",
          options  = list(pageLength = 4, dom = 'ftp', scrollX = TRUE),
          rownames = FALSE)

# ============================================================
# TASK 3: Missing Value Handling (MANDATORY IF)
# ============================================================
cat("\n--- Task 3: Missing Value Handling ---\n")

# Logic: Median is chosen as the default for ratings because it is robust against outliers.
rating_median <- median(suppressWarnings(as.numeric(df_clean$customer_rating)), na.rm = TRUE)

for (i in 1:nrow(df_clean)) {
  if (is.na(df_clean$payment_method[i]) || trimws(as.character(df_clean$payment_method[i])) == "") {
    df_clean$payment_method[i] <- "Unknown"
  }
  if (is.na(df_clean$customer_rating[i]) || trimws(as.character(df_clean$customer_rating[i])) == "") {
    df_clean$customer_rating[i] <- rating_median
  }
}

df_clean$customer_rating <- suppressWarnings(as.numeric(df_clean$customer_rating))

missing_tbl <- data.frame(
  Column  = c("payment_method", "customer_rating"),
  Missing_After = c(
    sum(is.na(df_clean$payment_method) | trimws(df_clean$payment_method) == ""),
    sum(is.na(df_clean$customer_rating))
  ),
  Method  = c("Filled with 'Unknown'",
              paste0("Filled with Median (", rating_median, ")"))
)
kable(missing_tbl, format = "markdown", align = "l",
      caption = "Missing Value Handling — Result & Logic")

# ============================================================
# TASK 1 + 4 + 5: MANDATORY LOOPING — 4 Categorical Columns Simultaneously
# ============================================================
cat("\n--- Task 1 & 4 & 5: Loop-based Categorical Standardization ---\n")

# Logic: Using a single loop to process 4 columns at once is more efficient and ensures consistency.
for (i in 1:nrow(df_clean)) {

  # Column 1: platform 
  val_platform <- trimws(tolower(as.character(df_clean$platform[i])))
  if (val_platform == "shopee") {
    df_clean$platform[i] <- "Shopee"
  } else if (val_platform %in% c("tokped", "tokopedia")) {
    df_clean$platform[i] <- "Tokopedia"
  } else if (val_platform == "blibli") {
    df_clean$platform[i] <- "Blibli"
  } else if (val_platform == "lazada") {
    df_clean$platform[i] <- "Lazada"
  } else if (val_platform == "tiktok shop") {
    df_clean$platform[i] <- "TikTok Shop"
  }

  # Column 2: order_status 
  val_status <- trimws(tolower(as.character(df_clean$order_status[i])))
  if (val_status == "delivered" || val_status == "completed") {
    df_clean$order_status[i] <- "Completed"
  } else if (val_status == "cancelled" || val_status == "cancel" || val_status == "batal") {
    df_clean$order_status[i] <- "Cancelled"
  } else if (val_status == "on delivery") {
    df_clean$order_status[i] <- "On Delivery"
  } else if (val_status == "shipped") {
    df_clean$order_status[i] <- "Shipped"
  } else if (val_status == "retur" || val_status == "returned") {
    df_clean$order_status[i] <- "Returned"
  }

  # Column 3: category
  # Logic: Standardizes all case/format variations into 5 clean category names.
  val_category <- trimws(tolower(gsub("_", " ", as.character(df_clean$category[i]))))
  if (val_category == "beauty") {
    df_clean$category[i] <- "Beauty"
  } else if (val_category == "electronics") {
    df_clean$category[i] <- "Electronics"
  } else if (val_category == "fashion") {
    df_clean$category[i] <- "Fashion"
  } else if (val_category == "home living") {
    df_clean$category[i] <- "Home Living"
  } else if (val_category == "sports") {
    df_clean$category[i] <- "Sports"
  }

  # Column 4: payment_method (trimws final pass) 
  df_clean$payment_method[i] <- trimws(as.character(df_clean$payment_method[i]))
}

# Result Table Task 1
platform_tbl <- as.data.frame(table(df_clean$platform))
names(platform_tbl) <- c("Platform", "Count")
kable(platform_tbl, format = "markdown", align = "l",
      caption = "Platform Distribution After Cleaning")

# Result Table Task 4
status_tbl <- as.data.frame(table(df_clean$order_status))
names(status_tbl) <- c("Order Status", "Count")
kable(status_tbl, format = "markdown", align = "l",
      caption = "Order Status Distribution After Cleaning")

# Result Table Category
category_tbl <- as.data.frame(table(df_clean$category))
names(category_tbl) <- c("Category", "Count")
kable(category_tbl, format = "markdown", align = "l",
      caption = "Category Distribution After Cleaning")

# Evidence Table Task 5
loop_result <- data.frame(
  Column  = c("platform", "order_status", "category", "payment_method"),
  Action  = c("Platform name standardization (IF)", 
              "Order status standardization (IF)",
              "Category name standardization (IF)",
              "Whitespace cleaning (trimws)"),
  Status  = c("Done", "Done", "Done", "Done")
)
kable(loop_result, format = "markdown", align = "l",
      caption = "Loop Cleaning — 4 Columns Processed Simultaneously")

# ============================================================
# FINAL SUMMARY
# ============================================================
cat("\n=== CLEANING SUMMARY ===\n")
cat("Total Rows:", nrow(df_clean), "\n\n")

missing_after <- data.frame(
  Column        = names(df_clean),
  Missing_Count = sapply(df_clean, function(x) sum(is.na(x) | trimws(as.character(x)) == "")),
  Percentage    = paste0(round(sapply(df_clean, function(x) sum(is.na(x) | trimws(as.character(x)) == "")) / nrow(df_clean) * 100, 2), "%"),
  row.names     = NULL
)

library(DT)
datatable(missing_after, caption = "Missing Values Per Column After Cleaning",
          options = list(pageLength = 10, dom = 'ftp'), rownames = FALSE)

## === SECTION C: DATA CLEANING ===
## 
## 
## --- Task 2: Currency Cleaning (4 Numeric Columns) ---
##   ✔ unit_price           → numeric | min: 30,827 | max: 2,498,385
##   ✔ gross_sales          → numeric | min: 31,646 | max: 22,642,020
##   ✔ net_sales            → numeric | min: 0 | max: 18,080,224
##   ✔ discount_value       → numeric | min: 0 | max: 6,792,606

## 
## --- Task 3: Missing Value Handling ---

Missing Value Handling — Result & Logic
Column	Missing_After	Method
payment_method	0	Filled with ‘Unknown’
customer_rating	0	Filled with Median (5)

## 
## --- Task 1 & 4 & 5: Loop-based Categorical Standardization ---

Platform Distribution After Cleaning
Platform	Count
Blibli	2000
Lazada	1905
Shopee	2055
TikTok Shop	1985
Tokopedia	1885

Order Status Distribution After Cleaning
Order Status	Count
Cancelled	755
Completed	7750
On Delivery	235
Returned	585
Shipped	505

Category Distribution After Cleaning
Category	Count
Beauty	1935
Electronics	1785
Fashion	2060
Home Living	2010
Sports	2040

Loop Cleaning — 4 Columns Processed Simultaneously
Column	Action	Status
platform	Platform name standardization (IF)	Done
order_status	Order status standardization (IF)	Done
category	Category name standardization (IF)	Done
payment_method	Whitespace cleaning (trimws)	Done

## 
## === CLEANING SUMMARY ===
## Total Rows: 9830

4 Section D

4.1 Business Logic Analysis and Column Definition

In this section, new variables are constructed based on predefined business rules. The primary focus of this task is to implement branching logic using if, else if, and else structures within a looping mechanism to classify transaction data.

Three new columns are added to the dataset:

is_high_value: Used to identify transactions with net sales exceeding 1,000,000.
order_priority: Classifies order priority into three levels (High, Medium, Low) using a nested IF method based on the sales amount.
valid_transaction: Evaluates transaction validity based on order status, where any “Cancelled” order is flagged as an invalid transaction.

4.2 Procedural Implementation Using R

# ============================================================
# SECTION D – CONDITIONAL LOGIC
# ============================================================
cat("=== SECTION D: CONDITIONAL LOGIC ===\n\n")

# Ensure net_sales is numeric (already cleaned in Section C)
df_clean$net_sales <- suppressWarnings(as.numeric(df_clean$net_sales))

# ============================================================
# TASK 1: is_high_value
# ============================================================
cat("--- Task 1: is_high_value ---\n")

# Initialize the column with NA before processing
df_clean$is_high_value <- NA_character_

for (i in 1:nrow(df_clean)) {
  # Logic: Binary classification based on a 1M threshold.
  # Check for NA to prevent logical errors during comparison.
  if (!is.na(df_clean$net_sales[i]) && df_clean$net_sales[i] > 1000000) {
    df_clean$is_high_value[i] <- "Yes"
  } else {
    df_clean$is_high_value[i] <- "No"
  }
}

high_value_tbl <- as.data.frame(table(df_clean$is_high_value))
names(high_value_tbl) <- c("is_high_value", "Count")
kable(high_value_tbl, format = "markdown", align = "l",
      caption = "is_high_value Distribution")

# ============================================================
# TASK 2: order_priority (REQUIRED Nested IF)
# ============================================================
cat("\n--- Task 2: order_priority (Nested IF) ---\n")

df_clean$order_priority <- NA_character_

for (i in 1:nrow(df_clean)) {
  val <- df_clean$net_sales[i]
  # Logic: Multi-tier classification using Nested IF to segment sales performance.
  if (!is.na(val)) {
    if (val >= 1000000) {
      df_clean$order_priority[i] <- "High"
    } else if (val >= 500000) {
      # This part acts as the nested condition for values < 1,000,000
      df_clean$order_priority[i] <- "Medium"
    } else {
      df_clean$order_priority[i] <- "Low"
    }
  }
}

priority_tbl <- as.data.frame(table(df_clean$order_priority))
names(priority_tbl) <- c("order_priority", "Count")
kable(priority_tbl, format = "markdown", align = "l",
      caption = "order_priority Distribution (Nested IF)")

# ============================================================
# TASK 3: valid_transaction
# ============================================================
cat("\n--- Task 3: valid_transaction ---\n")

df_clean$valid_transaction <- NA_character_

for (i in 1:nrow(df_clean)) {
  # Logic: Categorical validation based on transaction status strings.
  # Flags "Cancelled" orders as "Invalid", otherwise "Valid".
  if (!is.na(df_clean$order_status[i]) && df_clean$order_status[i] == "Cancelled") {
    df_clean$valid_transaction[i] <- "Invalid"
  } else {
    df_clean$valid_transaction[i] <- "Valid"
  }
}

valid_tbl <- as.data.frame(table(df_clean$valid_transaction))
names(valid_tbl) <- c("valid_transaction", "Count")
kable(valid_tbl, format = "markdown", align = "l",
      caption = "valid_transaction Distribution")

# ============================================================
# FINAL PREVIEW – 3 New Columns (Interactive Table)
# ============================================================
cat("\n=== SECTION D SUMMARY – New Columns Added ===\n")

summary_d <- data.frame(
  Column      = c("is_high_value", "order_priority", "valid_transaction"),
  Logic       = c("net_sales > 1,000,000 → Yes / No",
                  ">= 1M → High | >= 500K → Medium | < 500K → Low",
                  "order_status = Cancelled → Invalid / Valid"),
  Type        = c("Binary IF", "Nested IF (3 levels)", "Binary IF"),
  Unique_Values = c(
    paste(sort(unique(df_clean$is_high_value)),   collapse = " / "),
    paste(sort(unique(df_clean$order_priority)),  collapse = " / "),
    paste(sort(unique(df_clean$valid_transaction)), collapse = " / ")
  )
)
kable(summary_d, format = "markdown", align = "l",
      caption = "Section D – New Columns & Logic Applied")

cat("\nSample Output (First 10 rows — Interactive):\n")
datatable(
  head(df_clean[, c("order_id", "net_sales", "order_status",
                     "is_high_value", "order_priority", "valid_transaction")], 10),
  caption  = "Sample: 3 New Columns Section D",
  options  = list(pageLength = 10, scrollX = TRUE, dom = 'ftp'),
  rownames = FALSE
)

## === SECTION D: CONDITIONAL LOGIC ===
## 
## --- Task 1: is_high_value ---

is_high_value Distribution
is_high_value	Count
No	6055
Yes	3775

## 
## --- Task 2: order_priority (Nested IF) ---

order_priority Distribution (Nested IF)
order_priority	Count
High	3775
Low	3995
Medium	2060

## 
## --- Task 3: valid_transaction ---

valid_transaction Distribution
valid_transaction	Count
Invalid	755
Valid	9075

## 
## === SECTION D SUMMARY – New Columns Added ===

Section D – New Columns & Logic Applied
Column	Logic	Type	Unique_Values
is_high_value	net_sales > 1,000,000 → Yes / No	Binary IF	No / Yes
order_priority	>= 1M → High \| >= 500K → Medium \| < 500K → Low	Nested IF (3 levels)	High / Low / Medium
valid_transaction	order_status = Cancelled → Invalid / Valid	Binary IF	Invalid / Valid

## 
## Sample Output (First 10 rows — Interactive):

5 Section E

5.1 Analytical Thinking — Business Insights from the Cleaned Dataset

The following three questions are answered using live computation from df_clean, filtering only valid transactions (valid_transaction == "Valid") to ensure accuracy.

# ============================================================
# SECTION E – ANALYTICAL THINKING
# ============================================================
cat("=== SECTION E: ANALYTICAL THINKING ===\n\n")

# Filter: only valid, non-duplicate transactions for accurate analysis
df_valid <- df_clean %>%
  filter(valid_transaction == "Valid") %>%
  distinct(order_id, .keep_all = TRUE)

cat(sprintf("Working dataset: %d valid unique transactions\n\n", nrow(df_valid)))

# ── Q1: Which platform is the most dominant? ─────────────────────────────────
cat("--- Q1: Most Dominant Platform ---\n")
platform_counts <- sort(table(df_valid$platform), decreasing = TRUE)
platform_df     <- as.data.frame(platform_counts)
names(platform_df) <- c("Platform", "Transactions")
platform_df$Share_Pct <- paste0(
  round(platform_df$Transactions / sum(platform_df$Transactions) * 100, 1), "%"
)

top_platform <- as.character(platform_df$Platform[1])
top_platform_n <- platform_df$Transactions[1]

datatable(platform_df,
          caption  = "Q1 — Transaction Count by Platform (Sorted Descending)",
          options  = list(pageLength = 5, dom = 'ftp'),
          rownames = FALSE)

cat(sprintf(
  "\nAnswer: The most dominant platform is %s with %d transactions (%s of valid transactions).\n",
  top_platform, top_platform_n, platform_df$Share_Pct[1]
))

# ── Q2: Which category appears most frequently? ───────────────────────────────
cat("\n--- Q2: Most Frequent Product Category ---\n")
category_counts <- sort(table(df_valid$category), decreasing = TRUE)
category_df     <- as.data.frame(category_counts)
names(category_df) <- c("Category", "Transactions")
category_df$Share_Pct <- paste0(
  round(category_df$Transactions / sum(category_df$Transactions) * 100, 1), "%"
)

top_category   <- as.character(category_df$Category[1])
top_category_n <- category_df$Transactions[1]

datatable(category_df,
          caption  = "Q2 — Transaction Count by Category (Sorted Descending)",
          options  = list(pageLength = 5, dom = 'ftp'),
          rownames = FALSE)

cat(sprintf(
  "\nAnswer: The most frequent category is %s with %d transactions (%s of valid transactions).\n",
  top_category, top_category_n, category_df$Share_Pct[1]
))

# ── Q3: What is the most common transaction status? ───────────────────────────
cat("\n--- Q3: Most Common Transaction Status ---\n")
# Use full df_clean for a complete status picture (including Invalid)
status_counts <- sort(table(df_clean$order_status), decreasing = TRUE)
status_df     <- as.data.frame(status_counts)
names(status_df) <- c("Order_Status", "Count")
status_df$Share_Pct <- paste0(
  round(status_df$Count / sum(status_df$Count) * 100, 1), "%"
)

top_status   <- as.character(status_df$Order_Status[1])
top_status_n <- status_df$Count[1]

datatable(status_df,
          caption  = "Q3 — Transaction Count by Order Status (Sorted Descending)",
          options  = list(pageLength = 5, dom = 'ftp'),
          rownames = FALSE)

cat(sprintf(
  "\nAnswer: The most common transaction status is %s with %d records (%s of all transactions).\n",
  top_status, top_status_n, status_df$Share_Pct[1]
))

# ── Final Dataset Snapshot ────────────────────────────────────────────────────
cat("\n=== FINAL DATASET SNAPSHOT ===\n")

snapshot <- data.frame(
  Metric = c(
    "Total Rows in df_clean", "Total Columns",
    "Valid Transactions", "Invalid (Cancelled)",
    "High-Value Orders (net_sales > 1M)",
    "Most Dominant Platform", "Top Product Category", "Top Order Status"
  ),
  Value = c(
    nrow(df_clean), ncol(df_clean),
    sum(df_clean$valid_transaction == "Valid",   na.rm = TRUE),
    sum(df_clean$valid_transaction == "Invalid", na.rm = TRUE),
    sum(df_clean$is_high_value == "Yes",         na.rm = TRUE),
    top_platform, top_category, top_status
  )
)

datatable(snapshot,
          caption  = "Final Dataset Snapshot — Section E Summary",
          options  = list(pageLength = 10, dom = 'ftp'),
          rownames = FALSE)

## === SECTION E: ANALYTICAL THINKING ===
## 
## Working dataset: 1815 valid unique transactions
## 
## --- Q1: Most Dominant Platform ---

## 
## Answer: The most dominant platform is Shopee with 375 transactions (20.7% of valid transactions).
## 
## --- Q2: Most Frequent Product Category ---

## 
## Answer: The most frequent category is Fashion with 386 transactions (21.3% of valid transactions).
## 
## --- Q3: Most Common Transaction Status ---

## 
## Answer: The most common transaction status is Completed with 7750 records (78.8% of all transactions).
## 
## === FINAL DATASET SNAPSHOT ===