Midterm Exam
Data Science Programming 1
Class A · Semester Genap 2026
& Statistical Computing
1 Section A
1.1 Logical Approach and Methodology
This section focuses on automating the collection and integration of data from multiple sources. To meet the assignment criteria, five different file formats—CSV, Excel, JSON, TXT, and XML—are processed.
The workflow is organized into three main stages:
- Data Acquisition: Using appropriate R libraries to read each file format, ensuring the data is ready for further processing.
- Structural Validation: A looping mechanism combined with if-else logic is used to inspect the rows and columns of each file. This step verifies whether the column structure matches the reference standard before any merging occurs.
- Data Integration: All validated datasets are consolidated into a single master dataset named df_utama. This stage includes standardizing data types and removing duplicate records to ensure the final output is clean and consistent.
# ============================================================
# SECTION A – DATA COLLECTION USING PROGRAMMING
# ============================================================
# Loading essential libraries for data manipulation and reporting
library(readr)
library(readxl)
library(jsonlite)
library(XML)
library(dplyr)
library(knitr)
library(DT)
# --- STEP 1: Data Acquisition (Reading multiple file formats) ---
df_csv <- read_csv("ecommerce.csv", show_col_types = FALSE)
df_xlsx <- read_excel("ecommerce.xlsx", sheet = "ecommerce_raw")
df_json <- fromJSON("ecommerce.json")
if (!is.data.frame(df_json)) { df_json <- as.data.frame(df_json) }
df_txt <- read_delim("ecommerce.txt", delim = "|", show_col_types = FALSE)
xml_data <- xmlParse("ecommerce.xml")
df_xml <- xmlToDataFrame(nodes = getNodeSet(xml_data, "//Record"), stringsAsFactors = FALSE)
# ============================================================
# STEP 2-4: Automation, Validation, and Integration
# ============================================================
file_list <- list(CSV = df_csv, XLSX = df_xlsx, JSON = df_json, TXT = df_txt, XML = df_xml)
# --- TASK REQUIREMENT: Display rows, columns, and column names per file ---
cat("=== FILE-BY-FILE SUMMARY ===\n\n")## === FILE-BY-FILE SUMMARY ===
for (file_name in names(file_list)) {
df_temp <- file_list[[file_name]]
cat(sprintf("File : %s\n", file_name))
cat(sprintf("Rows : %d\n", nrow(df_temp)))
cat(sprintf("Columns : %d\n", ncol(df_temp)))
cat(sprintf("Col Names: %s\n\n", paste(names(df_temp), collapse = ", ")))
}## File : CSV
## Rows : 2000
## Columns : 22
## Col Names: order_id, order_date, ship_date, platform, category, product_name, unit_price, quantity, gross_sales, campaign, voucher_code, discount_pct, discount_value, shipping_cost, net_sales, payment_method, customer_segment, region, stock_status, order_status, customer_rating, priority_flag
##
## File : XLSX
## Rows : 2000
## Columns : 22
## Col Names: order_id, order_date, ship_date, platform, category, product_name, unit_price, quantity, gross_sales, campaign, voucher_code, discount_pct, discount_value, shipping_cost, net_sales, payment_method, customer_segment, region, stock_status, order_status, customer_rating, priority_flag
##
## File : JSON
## Rows : 2000
## Columns : 22
## Col Names: order_id, order_date, ship_date, platform, category, product_name, unit_price, quantity, gross_sales, campaign, voucher_code, discount_pct, discount_value, shipping_cost, net_sales, payment_method, customer_segment, region, stock_status, order_status, customer_rating, priority_flag
##
## File : TXT
## Rows : 2000
## Columns : 22
## Col Names: order_id, order_date, ship_date, platform, category, product_name, unit_price, quantity, gross_sales, campaign, voucher_code, discount_pct, discount_value, shipping_cost, net_sales, payment_method, customer_segment, region, stock_status, order_status, customer_rating, priority_flag
##
## File : XML
## Rows : 2000
## Columns : 22
## Col Names: order_id, order_date, ship_date, platform, category, product_name, unit_price, quantity, gross_sales, campaign, voucher_code, discount_pct, discount_value, shipping_cost, net_sales, payment_method, customer_segment, region, stock_status, order_status, customer_rating, priority_flag
# --- DATA SUMMARY (Interactive Table) ---
ringkasan_df <- data.frame(
File = names(file_list),
Rows = sapply(file_list, nrow),
Columns = sapply(file_list, ncol),
Column_Names = sapply(file_list, function(df) paste(names(df), collapse = ", ")),
row.names = NULL
)
datatable(ringkasan_df,
caption = "Data Summary per File (Rows, Columns & Column Names)",
options = list(pageLength = 5, scrollX = TRUE, dom = 'ftp'),
rownames = FALSE)# --- COLUMN VALIDATION using FOR LOOP + IF/ELSE ---
reference_cols <- names(df_csv)
files_ready <- list()
validasi_df <- data.frame(File = character(), Rows = integer(),
Columns = integer(), Status = character(),
stringsAsFactors = FALSE)
for (file_name in names(file_list)) {
df_temp <- file_list[[file_name]]
ref_norm <- tolower(trimws(reference_cols))
curr_norm <- tolower(trimws(names(df_temp)))
# IF/ELSE: check whether column structure matches the reference
if (identical(sort(ref_norm), sort(curr_norm))) {
status <- "Ready to merge ✓"
files_ready[[file_name]] <- df_temp
} else {
status <- "Need adjustment ✗"
}
validasi_df <- rbind(validasi_df, data.frame(
File = file_name,
Rows = nrow(df_temp),
Columns = ncol(df_temp),
Status = status,
stringsAsFactors = FALSE
))
}
datatable(validasi_df,
caption = "Column Structure Validation per File",
options = list(pageLength = 5, dom = 'ftp'),
rownames = FALSE)# --- Data Merging into df_utama ---
if (length(files_ready) > 0) {
standardize_cols <- function(df, ref_cols) {
df <- df[, ref_cols[ref_cols %in% names(df)], drop = FALSE]
df[] <- lapply(df, as.character)
return(df)
}
files_std <- lapply(files_ready, standardize_cols, ref_cols = reference_cols)
for (fname in names(files_std)) { files_std[[fname]]$source_file <- fname }
# Final Consolidated Dataset
df_utama <- bind_rows(files_std) %>%
distinct(order_id, source_file, .keep_all = TRUE)
cat("- Final row count in df_utama:", nrow(df_utama), "\n")
cat("- Total columns :", ncol(df_utama), "\n")
cat("- Data is ready as 'df_utama'\n\n")
# Interactive preview of merged dataset
datatable(head(df_utama, 10),
caption = "Preview of df_utama — Top 10 Rows (All Columns)",
options = list(pageLength = 5, scrollX = TRUE),
rownames = FALSE)
} else {
cat("\n[ERROR] No datasets matched requirements. Merging cancelled.\n")
}## - Final row count in df_utama: 9830
## - Total columns : 23
## - Data is ready as 'df_utama'
2 Section B
2.1 Logical Approach and Data Handling Methodology
Following the data integration in Section A, an evaluation of df_utama is conducted to ensure analytical quality and accuracy. This process focuses on three key areas:
Dataset Overview: Validating final dimensions and inspecting data types for each column to ensure consistent variable structures.
Data Integrity Check: Detecting data irregularities, including identifying missing values (system NA and empty strings) and duplicate rows to prevent double-counting.
Quality Issue Identification: Critically analyzing variable format anomalies and technical risks arising from merging data from diverse source formats.
2.2 R Code Implementation and Analysis Output
library(DT)
# ============================================================
# SECTION B – DATA HANDLING
# ============================================================
# TASK 1: Display total rows, columns, and data types
cat("=== TASK 1: Dataset Overview ===\n")
cat("Total Rows :", nrow(df_utama), "\n")
cat("Total Columns:", ncol(df_utama), "\n\n")
tipe_df <- data.frame(
Column = names(df_utama),
Data_Type = sapply(df_utama, class),
row.names = NULL
)
datatable(tipe_df,
caption = "Data Type per Column in df_utama",
options = list(pageLength = 10, scrollX = TRUE),
rownames = FALSE)
# TASK 2: Identify Missing Values & Duplicate Rows
cat("\n=== TASK 2: Missing Values & Duplicate Rows ===\n")
missing_df <- data.frame(
Column = names(df_utama),
Missing_Count = sapply(df_utama, function(x) sum(is.na(x) | trimws(as.character(x)) == "")),
row.names = NULL
)
missing_df$Percentage <- paste0(
round(missing_df$Missing_Count / nrow(df_utama) * 100, 2), "%"
)
datatable(missing_df,
caption = "Missing Values per Column",
options = list(pageLength = 10, scrollX = TRUE),
rownames = FALSE)
cat("Total Missing Values (all columns):", sum(missing_df$Missing_Count), "\n")
dup_count <- sum(duplicated(df_utama))
cat("Total Duplicate Rows:", dup_count, "\n")
if (dup_count > 0) {
datatable(head(df_utama[duplicated(df_utama), ], 5),
caption = "Sample Duplicate Rows",
options = list(pageLength = 5, scrollX = TRUE),
rownames = FALSE)
}
# TASK 3: At Least 3 Data Quality Issues
total_missing <- sum(missing_df$Missing_Count)
cat("\n=== TASK 3: Data Quality Issues Found ===\n\n")
cat("1. MISSING VALUES\n")
cat(" Found", total_missing, "empty/NA values across the dataset.\n")
cat(" Most affected column:",
missing_df$Column[which.max(missing_df$Missing_Count)],
"with", max(missing_df$Missing_Count), "missing values.\n\n")
cat("2. DUPLICATE ROWS\n")
cat(" Found", dup_count, "duplicate rows which may lead to\n")
cat(" double-counting in analysis and reporting.\n\n")
char_cols <- sum(sapply(df_utama, is.character))
cat("3. INCONSISTENT DATA TYPES\n")
cat(" ", char_cols, "out of", ncol(df_utama),
"columns are 'character' type, including numerical columns such as\n")
cat(" 'price', 'quantity', etc., which should be numeric/integer.\n\n")
source_summary <- table(df_utama$source_file)
cat("4. DATA ORIGINATED FROM VARIOUS FILE FORMATS\n")
cat(" Dataset merged from", length(source_summary), "different file sources.\n")
cat(" Different formats pose a risk of inconsistent encoding and separators.\n")## === TASK 1: Dataset Overview ===
## Total Rows : 9830
## Total Columns: 23
##
## === TASK 2: Missing Values & Duplicate Rows ===
## Total Missing Values (all columns): 4700
## Total Duplicate Rows: 0
##
## === TASK 3: Data Quality Issues Found ===
##
## 1. MISSING VALUES
## Found 4700 empty/NA values across the dataset.
## Most affected column: customer_rating with 2005 missing values.
##
## 2. DUPLICATE ROWS
## Found 0 duplicate rows which may lead to
## double-counting in analysis and reporting.
##
## 3. INCONSISTENT DATA TYPES
## 23 out of 23 columns are 'character' type, including numerical columns such as
## 'price', 'quantity', etc., which should be numeric/integer.
##
## 4. DATA ORIGINATED FROM VARIOUS FILE FORMATS
## Dataset merged from 5 different file sources.
## Different formats pose a risk of inconsistent encoding and separators.
3 Section C
3.1 Data Cleaning Logical Framework
This stage focuses on executing the cleaning of df_utama using programmatic logic to ensure the dataset is analysis-ready. The process covers three key areas:
Price Normalization: Removing currency formats (Rp) and enforcing non-negative constraints to achieve mathematical consistency.
Missing Value Handling: Filling empty payment_method entries with “Unknown” and customer_rating with the median. The median is chosen for its robustness against extreme outliers.
Categorical Standardization Loop: Utilizing a single loop mechanism to perform whitespace trimming and unify text formatting across platform, order_status, and payment_method simultaneously.
3.2 Technical Implementation of Data Cleaning
# ============================================================
# SECTION C – DATA CLEANING
# ============================================================
cat("=== SECTION C: DATA CLEANING ===\n\n")
df_clean <- df_utama
# ============================================================
# TASK 2: Price / Sales Value Cleaning — ALL 4 NUMERIC COLUMNS
# Logic: Remove 'Rp' prefix, strip thousand-separator dots,
# convert to numeric, force any value < 0 to 0.
# ============================================================
cat("\n--- Task 2: Currency Cleaning (4 Numeric Columns) ---\n")
# Reusable helper function
clean_currency_vec <- function(vec) {
result <- vector("numeric", length(vec))
for (i in seq_along(vec)) {
val <- as.character(vec[i])
if (is.na(val) || trimws(val) == "" || toupper(trimws(val)) == "NA") {
result[i] <- NA_real_
next
}
# Remove 'Rp' prefix and thousand-separator dots
if (grepl("Rp", trimws(val), ignore.case = TRUE)) {
val <- gsub("Rp\\s*", "", val, ignore.case = TRUE)
val <- gsub("\\.", "", val)
val <- gsub(",", ".", val)
}
num <- suppressWarnings(as.numeric(trimws(val)))
# IF: enforce non-negative — any value < 0 becomes 0
if (!is.na(num) && num < 0) {
result[i] <- 0
} else {
result[i] <- ifelse(is.na(num), NA_real_, num)
}
}
return(result)
}
# MANDATORY LOOP: iterate over all 4 financial columns
numeric_cols <- c("unit_price", "gross_sales", "net_sales", "discount_value")
for (col in numeric_cols) {
df_clean[[col]] <- clean_currency_vec(df_clean[[col]])
cat(sprintf(" ✔ %-20s → numeric | min: %s | max: %s\n",
col,
format(round(min(df_clean[[col]], na.rm = TRUE)), big.mark = ","),
format(round(max(df_clean[[col]], na.rm = TRUE)), big.mark = ",")))
}
# Interactive summary table for all 4 cleaned numeric columns
num_summary <- data.frame(
Column = numeric_cols,
Min = sapply(numeric_cols, function(c) round(min(df_clean[[c]], na.rm = TRUE))),
Q1 = sapply(numeric_cols, function(c) round(quantile(df_clean[[c]], 0.25, na.rm = TRUE))),
Median = sapply(numeric_cols, function(c) round(median(df_clean[[c]], na.rm = TRUE))),
Mean = sapply(numeric_cols, function(c) round(mean(df_clean[[c]], na.rm = TRUE))),
Max = sapply(numeric_cols, function(c) round(max(df_clean[[c]], na.rm = TRUE))),
NA_Count = sapply(numeric_cols, function(c) sum(is.na(df_clean[[c]]))),
row.names = NULL
)
datatable(num_summary,
caption = "All 4 Numeric Columns — Summary After Currency Cleaning",
options = list(pageLength = 4, dom = 'ftp', scrollX = TRUE),
rownames = FALSE)
# ============================================================
# TASK 3: Missing Value Handling (MANDATORY IF)
# ============================================================
cat("\n--- Task 3: Missing Value Handling ---\n")
# Logic: Median is chosen as the default for ratings because it is robust against outliers.
rating_median <- median(suppressWarnings(as.numeric(df_clean$customer_rating)), na.rm = TRUE)
for (i in 1:nrow(df_clean)) {
if (is.na(df_clean$payment_method[i]) || trimws(as.character(df_clean$payment_method[i])) == "") {
df_clean$payment_method[i] <- "Unknown"
}
if (is.na(df_clean$customer_rating[i]) || trimws(as.character(df_clean$customer_rating[i])) == "") {
df_clean$customer_rating[i] <- rating_median
}
}
df_clean$customer_rating <- suppressWarnings(as.numeric(df_clean$customer_rating))
missing_tbl <- data.frame(
Column = c("payment_method", "customer_rating"),
Missing_After = c(
sum(is.na(df_clean$payment_method) | trimws(df_clean$payment_method) == ""),
sum(is.na(df_clean$customer_rating))
),
Method = c("Filled with 'Unknown'",
paste0("Filled with Median (", rating_median, ")"))
)
kable(missing_tbl, format = "markdown", align = "l",
caption = "Missing Value Handling — Result & Logic")
# ============================================================
# TASK 1 + 4 + 5: MANDATORY LOOPING — 4 Categorical Columns Simultaneously
# ============================================================
cat("\n--- Task 1 & 4 & 5: Loop-based Categorical Standardization ---\n")
# Logic: Using a single loop to process 4 columns at once is more efficient and ensures consistency.
for (i in 1:nrow(df_clean)) {
# Column 1: platform
val_platform <- trimws(tolower(as.character(df_clean$platform[i])))
if (val_platform == "shopee") {
df_clean$platform[i] <- "Shopee"
} else if (val_platform %in% c("tokped", "tokopedia")) {
df_clean$platform[i] <- "Tokopedia"
} else if (val_platform == "blibli") {
df_clean$platform[i] <- "Blibli"
} else if (val_platform == "lazada") {
df_clean$platform[i] <- "Lazada"
} else if (val_platform == "tiktok shop") {
df_clean$platform[i] <- "TikTok Shop"
}
# Column 2: order_status
val_status <- trimws(tolower(as.character(df_clean$order_status[i])))
if (val_status == "delivered" || val_status == "completed") {
df_clean$order_status[i] <- "Completed"
} else if (val_status == "cancelled" || val_status == "cancel" || val_status == "batal") {
df_clean$order_status[i] <- "Cancelled"
} else if (val_status == "on delivery") {
df_clean$order_status[i] <- "On Delivery"
} else if (val_status == "shipped") {
df_clean$order_status[i] <- "Shipped"
} else if (val_status == "retur" || val_status == "returned") {
df_clean$order_status[i] <- "Returned"
}
# Column 3: category
# Logic: Standardizes all case/format variations into 5 clean category names.
val_category <- trimws(tolower(gsub("_", " ", as.character(df_clean$category[i]))))
if (val_category == "beauty") {
df_clean$category[i] <- "Beauty"
} else if (val_category == "electronics") {
df_clean$category[i] <- "Electronics"
} else if (val_category == "fashion") {
df_clean$category[i] <- "Fashion"
} else if (val_category == "home living") {
df_clean$category[i] <- "Home Living"
} else if (val_category == "sports") {
df_clean$category[i] <- "Sports"
}
# Column 4: payment_method (trimws final pass)
df_clean$payment_method[i] <- trimws(as.character(df_clean$payment_method[i]))
}
# Result Table Task 1
platform_tbl <- as.data.frame(table(df_clean$platform))
names(platform_tbl) <- c("Platform", "Count")
kable(platform_tbl, format = "markdown", align = "l",
caption = "Platform Distribution After Cleaning")
# Result Table Task 4
status_tbl <- as.data.frame(table(df_clean$order_status))
names(status_tbl) <- c("Order Status", "Count")
kable(status_tbl, format = "markdown", align = "l",
caption = "Order Status Distribution After Cleaning")
# Result Table Category
category_tbl <- as.data.frame(table(df_clean$category))
names(category_tbl) <- c("Category", "Count")
kable(category_tbl, format = "markdown", align = "l",
caption = "Category Distribution After Cleaning")
# Evidence Table Task 5
loop_result <- data.frame(
Column = c("platform", "order_status", "category", "payment_method"),
Action = c("Platform name standardization (IF)",
"Order status standardization (IF)",
"Category name standardization (IF)",
"Whitespace cleaning (trimws)"),
Status = c("Done", "Done", "Done", "Done")
)
kable(loop_result, format = "markdown", align = "l",
caption = "Loop Cleaning — 4 Columns Processed Simultaneously")
# ============================================================
# FINAL SUMMARY
# ============================================================
cat("\n=== CLEANING SUMMARY ===\n")
cat("Total Rows:", nrow(df_clean), "\n\n")
missing_after <- data.frame(
Column = names(df_clean),
Missing_Count = sapply(df_clean, function(x) sum(is.na(x) | trimws(as.character(x)) == "")),
Percentage = paste0(round(sapply(df_clean, function(x) sum(is.na(x) | trimws(as.character(x)) == "")) / nrow(df_clean) * 100, 2), "%"),
row.names = NULL
)
library(DT)
datatable(missing_after, caption = "Missing Values Per Column After Cleaning",
options = list(pageLength = 10, dom = 'ftp'), rownames = FALSE)## === SECTION C: DATA CLEANING ===
##
##
## --- Task 2: Currency Cleaning (4 Numeric Columns) ---
## ✔ unit_price → numeric | min: 30,827 | max: 2,498,385
## ✔ gross_sales → numeric | min: 31,646 | max: 22,642,020
## ✔ net_sales → numeric | min: 0 | max: 18,080,224
## ✔ discount_value → numeric | min: 0 | max: 6,792,606
##
## --- Task 3: Missing Value Handling ---
| Column | Missing_After | Method |
|---|---|---|
| payment_method | 0 | Filled with ‘Unknown’ |
| customer_rating | 0 | Filled with Median (5) |
##
## --- Task 1 & 4 & 5: Loop-based Categorical Standardization ---
| Platform | Count |
|---|---|
| Blibli | 2000 |
| Lazada | 1905 |
| Shopee | 2055 |
| TikTok Shop | 1985 |
| Tokopedia | 1885 |
| Order Status | Count |
|---|---|
| Cancelled | 755 |
| Completed | 7750 |
| On Delivery | 235 |
| Returned | 585 |
| Shipped | 505 |
| Category | Count |
|---|---|
| Beauty | 1935 |
| Electronics | 1785 |
| Fashion | 2060 |
| Home Living | 2010 |
| Sports | 2040 |
| Column | Action | Status |
|---|---|---|
| platform | Platform name standardization (IF) | Done |
| order_status | Order status standardization (IF) | Done |
| category | Category name standardization (IF) | Done |
| payment_method | Whitespace cleaning (trimws) | Done |
##
## === CLEANING SUMMARY ===
## Total Rows: 9830
4 Section D
4.1 Business Logic Analysis and Column Definition
In this section, new variables are constructed based on predefined business rules. The primary focus of this task is to implement branching logic using if, else if, and else structures within a looping mechanism to classify transaction data.
Three new columns are added to the dataset:
is_high_value: Used to identify transactions with net sales exceeding 1,000,000.
order_priority: Classifies order priority into three levels (High, Medium, Low) using a nested IF method based on the sales amount.
valid_transaction: Evaluates transaction validity based on order status, where any “Cancelled” order is flagged as an invalid transaction.
4.2 Procedural Implementation Using R
# ============================================================
# SECTION D – CONDITIONAL LOGIC
# ============================================================
cat("=== SECTION D: CONDITIONAL LOGIC ===\n\n")
# Ensure net_sales is numeric (already cleaned in Section C)
df_clean$net_sales <- suppressWarnings(as.numeric(df_clean$net_sales))
# ============================================================
# TASK 1: is_high_value
# ============================================================
cat("--- Task 1: is_high_value ---\n")
# Initialize the column with NA before processing
df_clean$is_high_value <- NA_character_
for (i in 1:nrow(df_clean)) {
# Logic: Binary classification based on a 1M threshold.
# Check for NA to prevent logical errors during comparison.
if (!is.na(df_clean$net_sales[i]) && df_clean$net_sales[i] > 1000000) {
df_clean$is_high_value[i] <- "Yes"
} else {
df_clean$is_high_value[i] <- "No"
}
}
high_value_tbl <- as.data.frame(table(df_clean$is_high_value))
names(high_value_tbl) <- c("is_high_value", "Count")
kable(high_value_tbl, format = "markdown", align = "l",
caption = "is_high_value Distribution")
# ============================================================
# TASK 2: order_priority (REQUIRED Nested IF)
# ============================================================
cat("\n--- Task 2: order_priority (Nested IF) ---\n")
df_clean$order_priority <- NA_character_
for (i in 1:nrow(df_clean)) {
val <- df_clean$net_sales[i]
# Logic: Multi-tier classification using Nested IF to segment sales performance.
if (!is.na(val)) {
if (val >= 1000000) {
df_clean$order_priority[i] <- "High"
} else if (val >= 500000) {
# This part acts as the nested condition for values < 1,000,000
df_clean$order_priority[i] <- "Medium"
} else {
df_clean$order_priority[i] <- "Low"
}
}
}
priority_tbl <- as.data.frame(table(df_clean$order_priority))
names(priority_tbl) <- c("order_priority", "Count")
kable(priority_tbl, format = "markdown", align = "l",
caption = "order_priority Distribution (Nested IF)")
# ============================================================
# TASK 3: valid_transaction
# ============================================================
cat("\n--- Task 3: valid_transaction ---\n")
df_clean$valid_transaction <- NA_character_
for (i in 1:nrow(df_clean)) {
# Logic: Categorical validation based on transaction status strings.
# Flags "Cancelled" orders as "Invalid", otherwise "Valid".
if (!is.na(df_clean$order_status[i]) && df_clean$order_status[i] == "Cancelled") {
df_clean$valid_transaction[i] <- "Invalid"
} else {
df_clean$valid_transaction[i] <- "Valid"
}
}
valid_tbl <- as.data.frame(table(df_clean$valid_transaction))
names(valid_tbl) <- c("valid_transaction", "Count")
kable(valid_tbl, format = "markdown", align = "l",
caption = "valid_transaction Distribution")
# ============================================================
# FINAL PREVIEW – 3 New Columns (Interactive Table)
# ============================================================
cat("\n=== SECTION D SUMMARY – New Columns Added ===\n")
summary_d <- data.frame(
Column = c("is_high_value", "order_priority", "valid_transaction"),
Logic = c("net_sales > 1,000,000 → Yes / No",
">= 1M → High | >= 500K → Medium | < 500K → Low",
"order_status = Cancelled → Invalid / Valid"),
Type = c("Binary IF", "Nested IF (3 levels)", "Binary IF"),
Unique_Values = c(
paste(sort(unique(df_clean$is_high_value)), collapse = " / "),
paste(sort(unique(df_clean$order_priority)), collapse = " / "),
paste(sort(unique(df_clean$valid_transaction)), collapse = " / ")
)
)
kable(summary_d, format = "markdown", align = "l",
caption = "Section D – New Columns & Logic Applied")
cat("\nSample Output (First 10 rows — Interactive):\n")
datatable(
head(df_clean[, c("order_id", "net_sales", "order_status",
"is_high_value", "order_priority", "valid_transaction")], 10),
caption = "Sample: 3 New Columns Section D",
options = list(pageLength = 10, scrollX = TRUE, dom = 'ftp'),
rownames = FALSE
)## === SECTION D: CONDITIONAL LOGIC ===
##
## --- Task 1: is_high_value ---
| is_high_value | Count |
|---|---|
| No | 6055 |
| Yes | 3775 |
##
## --- Task 2: order_priority (Nested IF) ---
| order_priority | Count |
|---|---|
| High | 3775 |
| Low | 3995 |
| Medium | 2060 |
##
## --- Task 3: valid_transaction ---
| valid_transaction | Count |
|---|---|
| Invalid | 755 |
| Valid | 9075 |
##
## === SECTION D SUMMARY – New Columns Added ===
| Column | Logic | Type | Unique_Values |
|---|---|---|---|
| is_high_value | net_sales > 1,000,000 → Yes / No | Binary IF | No / Yes |
| order_priority | >= 1M → High | >= 500K → Medium | < 500K → Low | Nested IF (3 levels) | High / Low / Medium |
| valid_transaction | order_status = Cancelled → Invalid / Valid | Binary IF | Invalid / Valid |
##
## Sample Output (First 10 rows — Interactive):
5 Section E
5.1 Analytical Thinking — Business Insights from the Cleaned Dataset
The following three questions are answered using live computation
from df_clean, filtering only valid transactions
(valid_transaction == "Valid") to ensure accuracy.
# ============================================================
# SECTION E – ANALYTICAL THINKING
# ============================================================
cat("=== SECTION E: ANALYTICAL THINKING ===\n\n")
# Filter: only valid, non-duplicate transactions for accurate analysis
df_valid <- df_clean %>%
filter(valid_transaction == "Valid") %>%
distinct(order_id, .keep_all = TRUE)
cat(sprintf("Working dataset: %d valid unique transactions\n\n", nrow(df_valid)))
# ── Q1: Which platform is the most dominant? ─────────────────────────────────
cat("--- Q1: Most Dominant Platform ---\n")
platform_counts <- sort(table(df_valid$platform), decreasing = TRUE)
platform_df <- as.data.frame(platform_counts)
names(platform_df) <- c("Platform", "Transactions")
platform_df$Share_Pct <- paste0(
round(platform_df$Transactions / sum(platform_df$Transactions) * 100, 1), "%"
)
top_platform <- as.character(platform_df$Platform[1])
top_platform_n <- platform_df$Transactions[1]
datatable(platform_df,
caption = "Q1 — Transaction Count by Platform (Sorted Descending)",
options = list(pageLength = 5, dom = 'ftp'),
rownames = FALSE)
cat(sprintf(
"\nAnswer: The most dominant platform is %s with %d transactions (%s of valid transactions).\n",
top_platform, top_platform_n, platform_df$Share_Pct[1]
))
# ── Q2: Which category appears most frequently? ───────────────────────────────
cat("\n--- Q2: Most Frequent Product Category ---\n")
category_counts <- sort(table(df_valid$category), decreasing = TRUE)
category_df <- as.data.frame(category_counts)
names(category_df) <- c("Category", "Transactions")
category_df$Share_Pct <- paste0(
round(category_df$Transactions / sum(category_df$Transactions) * 100, 1), "%"
)
top_category <- as.character(category_df$Category[1])
top_category_n <- category_df$Transactions[1]
datatable(category_df,
caption = "Q2 — Transaction Count by Category (Sorted Descending)",
options = list(pageLength = 5, dom = 'ftp'),
rownames = FALSE)
cat(sprintf(
"\nAnswer: The most frequent category is %s with %d transactions (%s of valid transactions).\n",
top_category, top_category_n, category_df$Share_Pct[1]
))
# ── Q3: What is the most common transaction status? ───────────────────────────
cat("\n--- Q3: Most Common Transaction Status ---\n")
# Use full df_clean for a complete status picture (including Invalid)
status_counts <- sort(table(df_clean$order_status), decreasing = TRUE)
status_df <- as.data.frame(status_counts)
names(status_df) <- c("Order_Status", "Count")
status_df$Share_Pct <- paste0(
round(status_df$Count / sum(status_df$Count) * 100, 1), "%"
)
top_status <- as.character(status_df$Order_Status[1])
top_status_n <- status_df$Count[1]
datatable(status_df,
caption = "Q3 — Transaction Count by Order Status (Sorted Descending)",
options = list(pageLength = 5, dom = 'ftp'),
rownames = FALSE)
cat(sprintf(
"\nAnswer: The most common transaction status is %s with %d records (%s of all transactions).\n",
top_status, top_status_n, status_df$Share_Pct[1]
))
# ── Final Dataset Snapshot ────────────────────────────────────────────────────
cat("\n=== FINAL DATASET SNAPSHOT ===\n")
snapshot <- data.frame(
Metric = c(
"Total Rows in df_clean", "Total Columns",
"Valid Transactions", "Invalid (Cancelled)",
"High-Value Orders (net_sales > 1M)",
"Most Dominant Platform", "Top Product Category", "Top Order Status"
),
Value = c(
nrow(df_clean), ncol(df_clean),
sum(df_clean$valid_transaction == "Valid", na.rm = TRUE),
sum(df_clean$valid_transaction == "Invalid", na.rm = TRUE),
sum(df_clean$is_high_value == "Yes", na.rm = TRUE),
top_platform, top_category, top_status
)
)
datatable(snapshot,
caption = "Final Dataset Snapshot — Section E Summary",
options = list(pageLength = 10, dom = 'ftp'),
rownames = FALSE)## === SECTION E: ANALYTICAL THINKING ===
##
## Working dataset: 1815 valid unique transactions
##
## --- Q1: Most Dominant Platform ---
##
## Answer: The most dominant platform is Shopee with 375 transactions (20.7% of valid transactions).
##
## --- Q2: Most Frequent Product Category ---
##
## Answer: The most frequent category is Fashion with 386 transactions (21.3% of valid transactions).
##
## --- Q3: Most Common Transaction Status ---
##
## Answer: The most common transaction status is Completed with 7750 records (78.8% of all transactions).
##
## === FINAL DATASET SNAPSHOT ===