This section focuses on automating the collection and integration of data from multiple sources. To meet the assignment criteria, five different file formats—CSV, Excel, JSON, TXT, and XML—are processed.
The workflow is organized into three main stages:
# ============================================================
# SECTION A – DATA COLLECTION USING PROGRAMMING
# ============================================================
# Loading essential libraries for data manipulation and reporting
library(readr)
library(readxl)
library(jsonlite)
library(XML)
library(dplyr)
library(knitr)
library(DT)
# Data Acquisition (Reading multiple file formats)
df_csv <- read_csv("ecommerce.csv", show_col_types = FALSE)
df_xlsx <- read_excel("ecommerce.xlsx", sheet = "ecommerce_raw")
df_json <- fromJSON("ecommerce.json")
if (!is.data.frame(df_json)) { df_json <- as.data.frame(df_json) }
df_txt <- read_delim("ecommerce.txt", delim = "|", show_col_types = FALSE)
xml_data <- xmlParse("ecommerce.xml")
df_xml <- xmlToDataFrame(nodes = getNodeSet(xml_data, "//Record"), stringsAsFactors = FALSE)
# ============================================================
# Automation, Validation, and Integration
# ============================================================
file_list <- list(CSV = df_csv, XLSX = df_xlsx, JSON = df_json, TXT = df_txt, XML = df_xml)
# TASK REQUIREMENT: Display rows, columns, and column names per file
for (file_name in names(file_list)) {
df_temp <- file_list[[file_name]]
}
# DATA SUMMARY (Interactive Table)
ringkasan_df <- data.frame(
File = names(file_list),
Rows = sapply(file_list, nrow),
Columns = sapply(file_list, ncol),
Column_Names = sapply(file_list, function(df) paste(names(df), collapse = ", ")),
row.names = NULL
)
datatable(ringkasan_df,
caption = "Data Summary per File (Rows, Columns & Column Names)",
options = list(pageLength = 5, scrollX = TRUE, dom = 'ftp'),
rownames = FALSE)# COLUMN VALIDATION using FOR LOOP + IF/ELSE
reference_cols <- names(df_csv)
files_ready <- list()
validasi_df <- data.frame(File = character(), Rows = integer(),
Columns = integer(), Status = character(),
stringsAsFactors = FALSE)
for (file_name in names(file_list)) {
df_temp <- file_list[[file_name]]
ref_norm <- tolower(trimws(reference_cols))
curr_norm <- tolower(trimws(names(df_temp)))
# IF/ELSE: check whether column structure matches the reference
if (identical(sort(ref_norm), sort(curr_norm))) {
status <- "Ready to merge ✓"
files_ready[[file_name]] <- df_temp
} else {
status <- "Need adjustment ✗"
}
validasi_df <- rbind(validasi_df, data.frame(
File = file_name,
Rows = nrow(df_temp),
Columns = ncol(df_temp),
Status = status,
stringsAsFactors = FALSE
))
}
datatable(validasi_df,
caption = "Column Structure Validation per File",
options = list(pageLength = 5, dom = 'ftp'),
rownames = FALSE)# Data Merging into df_utama
if (length(files_ready) > 0) {
standardize_cols <- function(df, ref_cols) {
df <- df[, ref_cols[ref_cols %in% names(df)], drop = FALSE]
df[] <- lapply(df, as.character)
return(df)
}
files_std <- lapply(files_ready, standardize_cols, ref_cols = reference_cols)
for (fname in names(files_std)) { files_std[[fname]]$source_file <- fname }
# Final Consolidated Dataset
# Create df_utama
df_utama <- bind_rows(files_std) %>%
distinct(order_id, source_file, .keep_all = TRUE)
# Interactive preview of merged dataset
datatable(head(df_utama, 10),
caption = "Preview of df_utama — Top 10 Rows (All Columns)",
options = list(pageLength = 5, scrollX = TRUE),
rownames = FALSE)
} else {
cat("\n[ERROR] No datasets matched requirements. Merging cancelled.\n")
}Following the data integration in Section A, an evaluation of df_utama is conducted to ensure analytical quality and accuracy. This process focuses on three key areas:
Dataset Overview: Validating final dimensions and inspecting data types for each column to ensure consistent variable structures.
Data Integrity Check: Detecting data irregularities, including identifying missing values (system NA and empty strings) and duplicate rows to prevent double-counting.
Quality Issue Identification: Critically analyzing variable format anomalies and technical risks arising from merging data from diverse source formats.
library(DT)
# ============================================================
# SECTION B – DATA HANDLING
# ============================================================
# Structural Overview & Data Types
# Map each column to its primitive R class to identify storage types.
tipe_df <- data.frame(
Column = names(df_utama),
Data_Type = sapply(df_utama, class),
row.names = NULL
)
datatable(
tipe_df,
caption = htmltools::tags$caption(
style = "caption-side: top; text-align: left; font-size: 13px; padding: 8px 0 4px 0;",
htmltools::strong(paste0(
"Task 1 — Data Type per Column in df_utama",
" | Total Rows: ", nrow(df_utama),
" · Total Columns: ", ncol(df_utama)
))
),
options = list(pageLength = 10, scrollX = TRUE),
rownames = FALSE
)# Missing Values & Duplicate Rows
# Detect both explicit NAs and empty strings/whitespace as missing values.
missing_df <- data.frame(
Column = names(df_utama),
Missing_Count = sapply(df_utama, function(x) sum(is.na(x) | trimws(as.character(x)) == "")),
row.names = NULL
)
# Calculate relative impact of missingness per variable.
missing_df$Percentage <- paste0(
round(missing_df$Missing_Count / nrow(df_utama) * 100, 2), "%"
)
total_missing <- sum(missing_df$Missing_Count)
dup_count <- sum(duplicated(df_utama))
datatable(
missing_df,
caption = htmltools::tags$caption(
style = "caption-side: top; text-align: left; font-size: 13px; padding: 8px 0 4px 0;",
htmltools::strong(paste0(
"Task 2 — Missing Values per Column",
" | Total Missing: ", total_missing,
" · Duplicate Rows: ", dup_count
))
),
options = list(pageLength = 10, scrollX = TRUE),
rownames = FALSE
)# Display a preview of duplicated records if any exist for verification.
if (dup_count > 0) {
datatable(
head(df_utama[duplicated(df_utama), ], 5),
caption = htmltools::tags$caption(
style = "caption-side: top; text-align: left; font-size: 13px; padding: 8px 0 4px 0;",
htmltools::strong(paste0(
"Task 2 — Sample Duplicate Rows | Total: ", dup_count, " duplicate rows found"
))
),
options = list(pageLength = 5, scrollX = TRUE),
rownames = FALSE
)
}# Data Quality Insights
# Consolidate metadata findings into a human-readable quality report.
most_affected_col <- missing_df$Column[which.max(missing_df$Missing_Count)]
most_affected_n <- max(missing_df$Missing_Count)
char_cols <- sum(sapply(df_utama, is.character))
source_summary <- table(df_utama$source_file)
quality_issues <- data.frame(
No = 1:4,
Issue = c(
"Missing Values",
"Duplicate Rows",
"Type Casting Issues",
"Source Fragmentation"
),
Description = c(
paste0(
"Identified ", total_missing, " NA/null entries. ",
"Highest missingness in: '", most_affected_col,
"' (", most_affected_n, " counts)."
),
paste0(
"Detected ", dup_count, " redundant rows. ",
"May cause statistical bias or inflated metrics."
),
paste0(
char_cols, " of ", ncol(df_utama), " columns are 'character'. ",
"Numeric fields like 'price/quantity' require conversion for calculation."
),
paste0(
"Merged from ", length(source_summary), " distinct files. ",
"Risk of heterogeneous encoding and delimiter mismatches."
)
),
row.names = NULL
)
datatable(
quality_issues,
caption = htmltools::tags$caption(
style = "caption-side: top; text-align: left; font-size: 13px; padding: 8px 0 4px 0;",
htmltools::strong("Task 3 — Data Quality Issues Summary")
),
options = list(
pageLength = 10,
scrollX = TRUE,
dom = 'ftp',
columnDefs = list(
list(width = '8%', targets = 0),
list(width = '25%', targets = 1),
list(width = '67%', targets = 2)
)
),
rownames = FALSE
) %>%
formatStyle(
columns = 'No',
textAlign = 'center',
fontWeight = 'bold'
) %>%
formatStyle(
columns = 'Issue',
fontWeight = 'bold',
textAlign = 'center'
)This stage focuses on executing the cleaning of df_utama using programmatic logic to ensure the dataset is analysis-ready. The process covers three key areas:
Price Normalization: Removing currency formats (Rp) and enforcing non-negative constraints to achieve mathematical consistency.
Missing Value Handling: Filling empty payment_method entries with “Unknown” and customer_rating with the median. The median is chosen for its robustness against extreme outliers.
Categorical Standardization Loop: Utilizing a single loop mechanism to perform whitespace trimming and unify text formatting across platform, order_status, and payment_method simultaneously.
# ============================================================
# SECTION C – DATA CLEANING
# ============================================================
library(DT)
library(knitr)
df_clean <- df_utama# Currency Cleaning (4 Numeric Columns)
clean_currency_vec <- function(vec) {
result <- vector("numeric", length(vec))
for (i in seq_along(vec)) {
val <- as.character(vec[i])
# Handle NA/empty strings
if (is.na(val) || trimws(val) == "" || toupper(trimws(val)) == "NA") {
result[i] <- NA_real_
next
}
# Convert IDR format (Rp 10.000,00) to standard numeric
if (grepl("Rp", trimws(val), ignore.case = TRUE)) {
val <- gsub("Rp\\s*", "", val, ignore.case = TRUE)
val <- gsub("\\.", "", val)
val <- gsub(",", ".", val)
}
num <- suppressWarnings(as.numeric(trimws(val)))
# Set negative values to 0; keep others as numeric
if (!is.na(num) && num < 0) {
result[i] <- 0
} else {
result[i] <- ifelse(is.na(num), NA_real_, num)
}
}
return(result)
}
numeric_cols <- c("unit_price", "gross_sales", "net_sales", "discount_value")
for (col in numeric_cols) {
df_clean[[col]] <- clean_currency_vec(df_clean[[col]])
}
num_summary <- data.frame(
Column = numeric_cols,
Min = sapply(numeric_cols, function(c) format(round(min(df_clean[[c]], na.rm = TRUE)), big.mark = ",")),
Q1 = sapply(numeric_cols, function(c) format(round(quantile(df_clean[[c]], 0.25, na.rm = TRUE)), big.mark = ",")),
Median = sapply(numeric_cols, function(c) format(round(median(df_clean[[c]], na.rm = TRUE)), big.mark = ",")),
Mean = sapply(numeric_cols, function(c) format(round(mean(df_clean[[c]], na.rm = TRUE)), big.mark = ",")),
Max = sapply(numeric_cols, function(c) format(round(max(df_clean[[c]], na.rm = TRUE)), big.mark = ",")),
NA_Count = sapply(numeric_cols, function(c) sum(is.na(df_clean[[c]]))),
row.names = NULL
)
datatable(
num_summary,
caption = htmltools::tags$caption(
style = "caption-side: top; text-align: left; font-size: 13px; padding: 8px 0 4px 0;",
htmltools::strong("Task 2 — Summary After Currency Cleaning")
),
options = list(pageLength = 4, dom = 'ftp', scrollX = TRUE),
rownames = FALSE
)# Missing Value Handling
# Use median for rating imputation to mitigate outlier impact
rating_median <- median(suppressWarnings(as.numeric(df_clean$customer_rating)), na.rm = TRUE)
for (i in 1:nrow(df_clean)) {
# Fill missing categorical with 'Unknown'
if (is.na(df_clean$payment_method[i]) || trimws(as.character(df_clean$payment_method[i])) == "") {
df_clean$payment_method[i] <- "Unknown"
}
# Fill missing numerical rating with median
if (is.na(df_clean$customer_rating[i]) || trimws(as.character(df_clean$customer_rating[i])) == "") {
df_clean$customer_rating[i] <- rating_median
}
}
df_clean$customer_rating <- suppressWarnings(as.numeric(df_clean$customer_rating))
missing_tbl <- data.frame(
Column = c("payment_method", "customer_rating"),
Missing_After = c(
sum(is.na(df_clean$payment_method) | trimws(df_clean$payment_method) == ""),
sum(is.na(df_clean$customer_rating))
),
Method = c(
"Filled with 'Unknown'",
paste0("Filled with Median (", rating_median, ")")
)
)
datatable(
missing_tbl,
caption = htmltools::tags$caption(
style = "caption-side: top; text-align: left; font-size: 13px; padding: 8px 0 4px 0;",
htmltools::strong("Task 3 — Missing Value Handling Result")
),
options = list(pageLength = 5, dom = 'ftp', scrollX = TRUE),
rownames = FALSE
)# Categorical Standardization via Loop
for (i in 1:nrow(df_clean)) {
# Standardization: platform
val_platform <- trimws(tolower(as.character(df_clean$platform[i])))
if (val_platform == "shopee") {
df_clean$platform[i] <- "Shopee"
} else if (val_platform %in% c("tokped", "tokopedia")) {
df_clean$platform[i] <- "Tokopedia"
} else if (val_platform == "blibli") {
df_clean$platform[i] <- "Blibli"
} else if (val_platform == "lazada") {
df_clean$platform[i] <- "Lazada"
} else if (val_platform == "tiktok shop") {
df_clean$platform[i] <- "TikTok Shop"
}
# Standardization: order_status
val_status <- trimws(tolower(as.character(df_clean$order_status[i])))
if (val_status == "delivered" || val_status == "completed") {
df_clean$order_status[i] <- "Completed"
} else if (val_status %in% c("cancelled", "cancel", "batal")) {
df_clean$order_status[i] <- "Cancelled"
} else if (val_status == "on delivery") {
df_clean$order_status[i] <- "On Delivery"
} else if (val_status == "shipped") {
df_clean$order_status[i] <- "Shipped"
} else if (val_status %in% c("retur", "returned")) {
df_clean$order_status[i] <- "Returned"
}
# Standardization: category
val_category <- trimws(tolower(gsub("_", " ", as.character(df_clean$category[i]))))
if (val_category == "beauty") {
df_clean$category[i] <- "Beauty"
} else if (val_category == "electronics") {
df_clean$category[i] <- "Electronics"
} else if (val_category == "fashion") {
df_clean$category[i] <- "Fashion"
} else if (val_category == "home living") {
df_clean$category[i] <- "Home Living"
} else if (val_category == "sports") {
df_clean$category[i] <- "Sports"
}
# Clean whitespace for payment_method
df_clean$payment_method[i] <- trimws(as.character(df_clean$payment_method[i]))
}
# Platform Table
platform_tbl <- as.data.frame(table(df_clean$platform))
names(platform_tbl) <- c("Platform", "Count")
datatable(
platform_tbl,
caption = htmltools::tags$caption(
style = "caption-side: top; text-align: left; font-size: 13px; padding: 8px 0 4px 0;",
htmltools::strong("Task 1 — Platform Distribution After Cleaning")
),
options = list(pageLength = 10, dom = 'ftp', scrollX = TRUE),
rownames = FALSE
)# Order Status Table
status_tbl <- as.data.frame(table(df_clean$order_status))
names(status_tbl) <- c("Order_Status", "Count")
datatable(
status_tbl,
caption = htmltools::tags$caption(
style = "caption-side: top; text-align: left; font-size: 13px; padding: 8px 0 4px 0;",
htmltools::strong("Task 4 — Order Status Distribution After Cleaning")
),
options = list(pageLength = 10, dom = 'ftp', scrollX = TRUE),
rownames = FALSE
)# Category Table
category_tbl <- as.data.frame(table(df_clean$category))
names(category_tbl) <- c("Category", "Count")
datatable(
category_tbl,
caption = htmltools::tags$caption(
style = "caption-side: top; text-align: left; font-size: 13px; padding: 8px 0 4px 0;",
htmltools::strong("Task 4 — Category Distribution After Cleaning")
),
options = list(pageLength = 10, dom = 'ftp', scrollX = TRUE),
rownames = FALSE
)# Summary of batch categorical cleaning performed in the loop
loop_result <- data.frame(
Column = c("platform", "order_status", "category", "payment_method"),
Action = c(
"Standardized names (IF/Else)",
"Mapped synonyms (IF/Else)",
"Cleaned underscores (gsub)",
"Trimmed whitespace (trimws)"
),
Status = c("Done", "Done", "Done", "Done")
)
datatable(
loop_result,
caption = htmltools::tags$caption(
style = "caption-side: top; text-align: left; font-size: 13px; padding: 8px 0 4px 0;",
htmltools::strong("Task 5 — Loop-based Cleaning Process Summary")
),
options = list(pageLength = 10, dom = 'ftp', scrollX = TRUE),
rownames = FALSE
)# Final Audit: Missing Values After Cleaning
missing_after <- data.frame(
Column = names(df_clean),
Missing_Count = sapply(df_clean, function(x) sum(is.na(x) | trimws(as.character(x)) == "")),
Percentage = paste0(
round(sapply(df_clean, function(x) sum(is.na(x) | trimws(as.character(x)) == "")) /
nrow(df_clean) * 100, 2), "%"),
row.names = NULL
)
datatable(
missing_after,
caption = htmltools::tags$caption(
style = "caption-side: top; text-align: left; font-size: 13px; padding: 8px 0 4px 0;",
htmltools::strong(paste0(
"Cleaning Summary — Missing Values Post-Cleaning",
" | Total Rows: ", nrow(df_clean)
))
),
options = list(pageLength = 10, dom = 'ftp', scrollX = TRUE),
rownames = FALSE
)In this section, new variables are constructed based on predefined business rules. The primary focus of this task is to implement branching logic using if, else if, and else structures within a looping mechanism to classify transaction data.
Three new columns are added to the dataset:
is_high_value: Used to identify transactions with net sales exceeding 1,000,000.
order_priority: Classifies order priority into three levels (High, Medium, Low) using a nested IF method based on the sales amount.
valid_transaction: Evaluates transaction validity based on order status, where any “Cancelled” order is flagged as an invalid transaction.
# ============================================================
# SECTION D – CONDITIONAL LOGIC
# ============================================================
# Ensure net_sales is numeric (already cleaned in Section C)
df_clean$net_sales <- suppressWarnings(as.numeric(df_clean$net_sales))
# ============================================================
# TASK 1: is_high_value
# ============================================================
# Initialize the column with NA before processing
df_clean$is_high_value <- NA_character_
for (i in 1:nrow(df_clean)) {
# Logic: Binary classification based on a 1M threshold.
# Check for NA to prevent logical errors during comparison.
if (!is.na(df_clean$net_sales[i]) && df_clean$net_sales[i] > 1000000) {
df_clean$is_high_value[i] <- "Yes"
} else {
df_clean$is_high_value[i] <- "No"
}
}
high_value_tbl <- as.data.frame(table(df_clean$is_high_value))
names(high_value_tbl) <- c("is_high_value", "Count")
kable(high_value_tbl, format = "markdown", align = "l",
caption = "is_high_value Distribution")
# ============================================================
# TASK 2: order_priority (REQUIRED Nested IF)
# ============================================================
df_clean$order_priority <- NA_character_
for (i in 1:nrow(df_clean)) {
val <- df_clean$net_sales[i]
# Logic: Multi-tier classification using Nested IF to segment sales performance.
if (!is.na(val)) {
if (val >= 1000000) {
df_clean$order_priority[i] <- "High"
} else if (val >= 500000) {
# This part acts as the nested condition for values < 1,000,000
df_clean$order_priority[i] <- "Medium"
} else {
df_clean$order_priority[i] <- "Low"
}
}
}
priority_tbl <- as.data.frame(table(df_clean$order_priority))
names(priority_tbl) <- c("order_priority", "Count")
kable(priority_tbl, format = "markdown", align = "l",
caption = "order_priority Distribution (Nested IF)")
# ============================================================
# TASK 3: valid_transaction
# ============================================================
df_clean$valid_transaction <- NA_character_
for (i in 1:nrow(df_clean)) {
# Logic: Categorical validation based on transaction status strings.
# Flags "Cancelled" orders as "Invalid", otherwise "Valid".
if (!is.na(df_clean$order_status[i]) && df_clean$order_status[i] == "Cancelled") {
df_clean$valid_transaction[i] <- "Invalid"
} else {
df_clean$valid_transaction[i] <- "Valid"
}
}
valid_tbl <- as.data.frame(table(df_clean$valid_transaction))
names(valid_tbl) <- c("valid_transaction", "Count")
kable(valid_tbl, format = "markdown", align = "l",
caption = "valid_transaction Distribution")
# ============================================================
# FINAL PREVIEW – 3 New Columns (Interactive Table)
# ============================================================
summary_d <- data.frame(
Column = c("is_high_value", "order_priority", "valid_transaction"),
Logic = c("net_sales > 1,000,000 → Yes / No",
">= 1M → High | >= 500K → Medium | < 500K → Low",
"order_status = Cancelled → Invalid / Valid"),
Type = c("Binary IF", "Nested IF (3 levels)", "Binary IF"),
Unique_Values = c(
paste(sort(unique(df_clean$is_high_value)), collapse = " / "),
paste(sort(unique(df_clean$order_priority)), collapse = " / "),
paste(sort(unique(df_clean$valid_transaction)), collapse = " / ")
)
)
kable(summary_d, format = "markdown", align = "l",
caption = "Section D – New Columns & Logic Applied")
datatable(
head(df_clean[, c("order_id", "net_sales", "order_status",
"is_high_value", "order_priority", "valid_transaction")], 10),
caption = "Sample: 3 New Columns Section D",
options = list(pageLength = 10, scrollX = TRUE, dom = 'ftp'),
rownames = FALSE
)| is_high_value | Count |
|---|---|
| No | 6055 |
| Yes | 3775 |
| order_priority | Count |
|---|---|
| High | 3775 |
| Low | 3995 |
| Medium | 2060 |
| valid_transaction | Count |
|---|---|
| Invalid | 755 |
| Valid | 9075 |
| Column | Logic | Type | Unique_Values |
|---|---|---|---|
| is_high_value | net_sales > 1,000,000 → Yes / No | Binary IF | No / Yes |
| order_priority | >= 1M → High | >= 500K → Medium | < 500K → Low | Nested IF (3 levels) | High / Low / Medium |
| valid_transaction | order_status = Cancelled → Invalid / Valid | Binary IF | Invalid / Valid |
The following three questions are answered using live computation
from df_clean, filtering only valid transactions
(valid_transaction == "Valid") to ensure accuracy.
# ============================================================
# SECTION E – ANALYTICAL THINKING | Setup & Helpers
# ============================================================
library(ggplot2)
library(dplyr)
library(plotly)
library(scales)
library(htmltools)
# Filter valid transactions and remove duplicate Order IDs for analysis integrity
df_valid <- df_clean %>%
filter(valid_transaction == "Valid") %>%
distinct(order_id, .keep_all = TRUE)
# Light theme base
light_theme <- theme(
plot.background = element_rect(fill = "#ffffff", color = NA),
panel.background = element_rect(fill = "#ffffff", color = NA),
panel.grid.major = element_line(color = "#e5e7eb", linewidth = 0.4),
panel.grid.minor = element_blank(),
axis.text = element_text(color = "#6b7280", size = 10),
axis.title = element_text(color = "#374151", size = 11),
plot.title = element_text(color = "#111827", size = 16, face = "bold", margin = margin(b = 4)),
plot.subtitle = element_text(color = "#6b7280", size = 11, margin = margin(b = 16)),
legend.background = element_rect(fill = "#ffffff", color = NA),
legend.text = element_text(color = "#374151"),
legend.title = element_text(color = "#374151"),
plot.margin = margin(16, 20, 16, 20)
)
# Helper: HTML metric card
metric_card <- function(label, value, subtitle = "", border_color = "#3b82f6") {
htmltools::div(
style = paste0(
"background:#f9fafb;",
"border:1px solid #e5e7eb;border-top:3px solid ", border_color, ";",
"border-radius:8px;padding:18px 20px;flex:1;min-width:0;"
),
htmltools::p(label,
style = "color:#6b7280;font-size:11px;font-weight:600;letter-spacing:.08em;margin:0 0 6px 0;font-family:monospace;"
),
htmltools::h2(value,
style = "color:#111827;font-size:28px;font-weight:800;margin:0 0 4px 0;font-family:sans-serif;"
),
htmltools::p(subtitle,
style = "color:#9ca3af;font-size:12px;margin:0;font-family:sans-serif;"
)
)
}
# Helper: section header HTML
section_header <- function(tag_label, title, subtitle) {
htmltools::div(
style = "background:#f9fafb;padding:24px 24px 8px 24px;border-bottom:1px solid #e5e7eb;",
htmltools::span(tag_label,
style = "background:#e5e7eb;color:#374151;font-size:11px;font-family:monospace;padding:4px 10px;border-radius:4px;"
),
htmltools::h1(title,
style = "color:#111827;font-size:28px;font-weight:800;margin:12px 0 4px 0;font-family:sans-serif;"
),
htmltools::p(subtitle,
style = "color:#6b7280;font-size:13px;margin:0;font-family:sans-serif;"
)
)
}# ─────────────────────────────────────────────────────────────────────────────
# Q1: PLATFORM DOMINANCE
# ─────────────────────────────────────────────────────────────────────────────
platform_df <- df_valid %>%
count(platform, name = "Transactions") %>%
arrange(desc(Transactions)) %>%
mutate(
Share_Pct = Transactions / sum(Transactions) * 100,
Share_Label = paste0(round(Share_Pct, 1), "%"),
platform = factor(platform, levels = rev(platform))
)
top_platform <- as.character(platform_df$platform[1])
top_n <- platform_df$Transactions[1]
top_pct <- platform_df$Share_Label[1]
total_tx <- sum(platform_df$Transactions)
avg_per_plt <- round(total_tx / nrow(platform_df))
platform_colors <- c("#6366f1","#3b82f6","#f59e0b","#10b981","#ef4444")
q1_cards <- htmltools::div(
style = "display:flex;gap:12px;padding:16px 24px;background:#ffffff;flex-wrap:nowrap;",
metric_card("DOMINANT PLATFORM", top_platform, paste0("#1 of ", nrow(platform_df), " platforms"), "#6366f1"),
metric_card("TOP TRANSACTIONS", format(top_n, big.mark=","), paste0(top_pct, " of all transactions"), "#10b981"),
metric_card("TOTAL TRANSACTIONS", format(total_tx, big.mark=","), "across all platforms", "#3b82f6"),
metric_card("AVG / PLATFORM", format(avg_per_plt, big.mark=","),"transactions per platform", "#f59e0b")
)
p_q1 <- ggplot(platform_df, aes(x = Transactions, y = platform, fill = platform)) +
geom_col(width = 0.65, show.legend = FALSE) +
geom_text(aes(label = paste0(Transactions, " · ", Share_Label)),
hjust = -0.05, color = "#111827", size = 3.8, fontface = "bold") +
geom_vline(xintercept = avg_per_plt, color = "#6b7280",
linetype = "dashed", linewidth = 0.5, alpha = 0.6) +
annotate("text", x = avg_per_plt + (max(platform_df$Transactions) * 0.01),
y = nrow(platform_df) + 0.4, label = "average",
color = "#6b7280", size = 3.2, hjust = 0) +
scale_fill_manual(values = setNames(platform_colors, levels(platform_df$platform))) +
scale_x_continuous(expand = expansion(mult = c(0, 0.18)), labels = comma) +
labs(title = paste0(top_platform, " leads with ", top_pct, " of all transactions"),
x = "Transaction Count", y = NULL) +
light_theme
q1_plotly <- ggplotly(p_q1, tooltip = c("x","y")) %>%
layout(
paper_bgcolor = "#ffffff", plot_bgcolor = "#ffffff",
font = list(color = "#374151"),
margin = list(l = 80, r = 40, t = 40, b = 40),
xaxis = list(gridcolor = "#e5e7eb"),
yaxis = list(gridcolor = "#e5e7eb")
)
htmltools::browsable(
htmltools::div(
style = "background:#ffffff;border:1px solid #e5e7eb;border-radius:12px;overflow:hidden;margin-bottom:32px;",
section_header("Q1 · ANALYTICAL THINKING", "Platform Dominance",
"Transaction distribution by sales platform"),
q1_cards,
htmltools::div(style = "padding:8px 24px 24px;background:#ffffff;",
htmltools::tagList(q1_plotly)
)
)
)# ─────────────────────────────────────────────────────────────────────────────
# Q2: PRODUCT CATEGORY DISTRIBUTION
# ─────────────────────────────────────────────────────────────────────────────
category_df <- df_valid %>%
count(category, name = "Transactions") %>%
arrange(desc(Transactions)) %>%
mutate(
Share_Pct = Transactions / sum(Transactions) * 100,
Share_Label = paste0(round(Share_Pct, 1), "%"),
category = factor(category, levels = rev(category))
)
top_cat <- as.character(category_df$category[which.max(category_df$Transactions)])
top_cat_n <- max(category_df$Transactions)
top_cat_pct <- category_df$Share_Label[which.max(category_df$Transactions)]
n_cats <- nrow(category_df)
med_count <- median(category_df$Transactions)
cat_colors <- c("#6366f1","#10b981","#f59e0b","#3b82f6","#ef4444")
q2_cards <- htmltools::div(
style = "display:flex;gap:12px;padding:16px 24px;background:#ffffff;flex-wrap:nowrap;",
metric_card("TOP CATEGORY", top_cat, paste0(top_cat_pct, " of total"), "#6366f1"),
metric_card("TOTAL CATEGORIES", as.character(n_cats), "product categories", "#10b981"),
metric_card("HIGHEST COUNT", format(top_cat_n, big.mark=","), top_cat, "#3b82f6"),
metric_card("MEDIAN COUNT", format(med_count, big.mark=","), "per category", "#f59e0b")
)
# Donut chart for categorical share comparison
donut_df <- category_df %>% arrange(desc(Transactions))
p2_donut_ly <- plot_ly(
donut_df,
labels = ~as.character(category),
values = ~Transactions,
type = "pie",
hole = 0.55,
marker = list(colors = cat_colors,
line = list(color = "#ffffff", width = 2)),
textinfo = "percent",
textfont = list(color = "#ffffff", size = 12),
hovertemplate = "%{label}<br>%{value} transactions<br>%{percent}<extra></extra>"
) %>%
layout(
paper_bgcolor = "#ffffff", plot_bgcolor = "#ffffff",
showlegend = FALSE,
annotations = list(list(
text = paste0(top_cat_pct, "<br>", top_cat),
x = 0.5, y = 0.5, xref = "paper", yref = "paper",
showarrow = FALSE,
font = list(color = "#111827", size = 13, family = "sans-serif")
)),
margin = list(l = 10, r = 10, t = 10, b = 10)
)
# Bar chart (horizontal)
p_cat_bar <- ggplot(category_df, aes(x = Transactions, y = category, fill = category)) +
geom_col(width = 0.65, show.legend = FALSE) +
geom_text(aes(label = Transactions), hjust = -0.2,
color = "#111827", size = 3.8, fontface = "bold") +
scale_fill_manual(values = setNames(cat_colors, levels(category_df$category))) +
scale_x_continuous(expand = expansion(mult = c(0, 0.15)), labels = comma) +
labs(x = "Transaction Count", y = NULL,
title = paste0(top_cat, " leads with ", top_cat_n,
" transactions (", top_cat_pct, ")")) +
light_theme
p2_bar_ly <- ggplotly(p_cat_bar, tooltip = c("x", "y")) %>%
layout(paper_bgcolor = "#ffffff", plot_bgcolor = "#ffffff",
font = list(color = "#374151"), showlegend = FALSE,
margin = list(l = 10, r = 10, t = 40, b = 40),
xaxis = list(gridcolor = "#e5e7eb"),
yaxis = list(gridcolor = "#e5e7eb"))
htmltools::browsable(
htmltools::div(
style = "background:#ffffff;border:1px solid #e5e7eb;border-radius:12px;overflow:hidden;margin-bottom:32px;",
section_header("Q2 · ANALYTICAL THINKING", "Product Category Distribution",
"Transaction frequency by category — donut & ranked bar"),
q2_cards,
htmltools::div(
style = "display:flex;gap:0;padding:8px 24px 24px;background:#ffffff;align-items:center;",
htmltools::div(style = "flex:0 0 38%;",
htmltools::tagList(p2_donut_ly)
),
htmltools::div(style = "flex:0 0 62%;",
htmltools::tagList(p2_bar_ly)
)
)
)
)# ─────────────────────────────────────────────────────────────────────────────
# Q3: TRANSACTION STATUS DISTRIBUTION
# ─────────────────────────────────────────────────────────────────────────────
status_df <- df_clean %>%
count(order_status, name = "Count") %>%
arrange(desc(Count)) %>%
mutate(
Share_Pct = Count / sum(Count) * 100,
Share_Label = paste0(round(Share_Pct, 1), "%"),
order_status = factor(order_status, levels = rev(order_status))
)
top_status <- as.character(status_df$order_status[1])
top_status_n <- status_df$Count[1]
top_status_pct <- status_df$Share_Label[1]
total_all <- sum(status_df$Count)
n_valid <- sum(df_clean$valid_transaction == "Valid", na.rm = TRUE)
n_invalid <- sum(df_clean$valid_transaction == "Invalid", na.rm = TRUE)
pct_valid <- round(n_valid / total_all * 100, 1)
pct_invalid <- round(n_invalid / total_all * 100, 1)
avg_status <- round(mean(status_df$Count))
q3_cards <- htmltools::div(
style = "display:flex;gap:12px;padding:16px 24px;background:#ffffff;flex-wrap:nowrap;",
metric_card("MOST COMMON STATUS", top_status, paste0(top_status_pct, " of total"), "#10b981"),
metric_card("VALID", format(n_valid, big.mark=","), paste0(pct_valid, "% of all records"), "#10b981"),
metric_card("INVALID (CANCELLED)", format(n_invalid, big.mark=","), paste0(pct_invalid, "% of all records"), "#ef4444"),
metric_card("TOTAL RECORDS", format(total_all, big.mark=","), "all statuses", "#3b82f6")
)
status_bar_colors <- c(
"Completed" = "#10b981",
"Cancelled" = "#ef4444",
"Returned" = "#6366f1",
"Shipped" = "#3b82f6",
"On Delivery" = "#f59e0b"
)
status_df_plot <- status_df %>%
arrange(desc(Count)) %>%
mutate(order_status = factor(order_status, levels = rev(as.character(order_status))))
p_status_bar <- ggplot(status_df_plot,
aes(x = order_status, y = Count, fill = order_status)) +
geom_col(width = 0.6, show.legend = FALSE) +
geom_text(aes(label = paste0(format(Count, big.mark=","), "\n", Share_Label)),
vjust = -0.4, color = "#111827", size = 3.2, fontface = "bold") +
geom_hline(yintercept = avg_status, color = "#6b7280",
linetype = "dashed", linewidth = 0.5, alpha = 0.6) +
annotate("text", x = nrow(status_df_plot) + 0.4, y = avg_status,
label = paste0("avg ", format(avg_status, big.mark=",")),
color = "#6b7280", size = 3.2, hjust = 1) +
scale_fill_manual(values = status_bar_colors) +
scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.15))) +
labs(title = paste0(top_status, " is the most common status · ",
pct_valid, "% of records are valid"),
x = NULL, y = "Transaction Count") +
light_theme
# Compare Valid vs Invalid data volumes
vi_df <- data.frame(
label = c("Valid", "Invalid"),
value = c(n_valid, n_invalid)
)
p3_donut_ly <- plot_ly(
vi_df,
labels = ~label,
values = ~value,
type = "pie",
hole = 0.55,
marker = list(colors = c("#10b981", "#ef4444"),
line = list(color = "#ffffff", width = 2)),
textinfo = "label+percent",
textfont = list(color = "#ffffff", size = 12),
hovertemplate = "%{label}<br>%{value:,} records<br>%{percent}<extra></extra>"
) %>%
layout(
title = list(text = "Valid vs Invalid Split",
font = list(color = "#111827", size = 13), x = 0.5),
paper_bgcolor = "#ffffff", plot_bgcolor = "#ffffff",
showlegend = FALSE,
annotations = list(list(
text = paste0(round(pct_valid), "%<br>valid"),
x = 0.5, y = 0.5, xref = "paper", yref = "paper",
showarrow = FALSE,
font = list(color = "#10b981", size = 15, family = "sans-serif")
)),
margin = list(l = 10, r = 10, t = 40, b = 10)
)
p3_bar_ly <- ggplotly(p_status_bar, tooltip = c("x", "y")) %>%
layout(paper_bgcolor = "#ffffff", plot_bgcolor = "#ffffff",
font = list(color = "#374151"), showlegend = FALSE,
margin = list(l = 40, r = 10, t = 40, b = 40),
xaxis = list(gridcolor = "#e5e7eb"),
yaxis = list(gridcolor = "#e5e7eb"))
htmltools::browsable(
htmltools::div(
style = "background:#ffffff;border:1px solid #e5e7eb;border-radius:12px;overflow:hidden;margin-bottom:32px;",
section_header("Q3 · ANALYTICAL THINKING", "Transaction Status Distribution",
"Valid vs invalid breakdown — frequency & proportion per status"),
q3_cards,
htmltools::div(
style = "display:flex;gap:0;padding:8px 24px 24px;background:#ffffff;align-items:center;",
htmltools::div(style = "flex:0 0 62%;",
htmltools::tagList(p3_bar_ly)
),
htmltools::div(style = "flex:0 0 38%;",
htmltools::tagList(p3_donut_ly)
)
)
)
)# Final Dataset Snapshot
snapshot <- data.frame(
Metric = c(
"Total Rows in df_clean", "Total Columns",
"Valid Transactions", "Invalid (Cancelled)",
"High-Value Orders (net_sales > 1M)",
"Most Dominant Platform", "Top Product Category", "Top Order Status"
),
Value = c(
nrow(df_clean), ncol(df_clean),
sum(df_clean$valid_transaction == "Valid", na.rm = TRUE),
sum(df_clean$valid_transaction == "Invalid", na.rm = TRUE),
sum(df_clean$is_high_value == "Yes", na.rm = TRUE),
top_platform, top_cat, top_status
)
)
DT::datatable(snapshot,
caption = "Final Dataset Snapshot — Section E Summary",
options = list(pageLength = 10, dom = 'ftp'),
rownames = FALSE)This section collects data from two websites with different HTML
structures using the rvest package in R.
The workflow is organized into two main stages:
Website 1 — Static HTML (Countries of the
World): All data is available in a single HTTP response, so one
read_html() call is enough. A for loop iterates through
each country element one by one. If any element is missing, an IF check
fills it in with a default value to keep the loop from
breaking.
Website 2 — Pagination & Form (Hockey
Teams): Since data is spread across multiple pages, the total
page count is detected first from page 1. The loop then runs
sequentially from the first to the last page. tryCatch() is
wrapped around each request so that if one page fails, scraping
continues on to the next without stopping entirely.
URL: https://www.scrapethissite.com/pages/simple/
Type: Static HTML | Method:rvest
url_countries <- "https://www.scrapethissite.com/pages/simple/"
page_countries <- read_html(url_countries)
# Select all country blocks using their shared CSS class
country_nodes <- page_countries %>% html_nodes("div.country")
# Initialize empty vectors — filled inside the loop below
country_name <- c()
country_capital <- c()
country_population <- c()
country_area <- c()
# Loop over every country node and pull out each field individually
for (node in country_nodes) {
name <- node %>% html_node("h3.country-name") %>% html_text(trim = TRUE)
cap_node <- node %>% html_node("span.country-capital")
capital <- if (!is.na(cap_node)) cap_node %>% html_text(trim = TRUE) else "Unknown"
pop_node <- node %>% html_node("span.country-population")
population <- ifelse(!is.na(pop_node), html_text(pop_node, trim = TRUE), NA)
area_node <- node %>% html_node("span.country-area")
area <- ifelse(!is.na(area_node), html_text(area_node, trim = TRUE), NA)
country_name <- c(country_name, name)
country_capital <- c(country_capital, capital)
country_population <- c(country_population, population)
country_area <- c(country_area, area)
}
df_countries <- data.frame(
country_name = country_name,
capital = country_capital,
population = suppressWarnings(as.numeric(country_population)),
area_sq_km = suppressWarnings(as.numeric(country_area)),
stringsAsFactors = FALSE
)
write_csv(df_countries, "countries_of_the_world.csv")Total country nodes detected : 250
Total records successfully parsed: 250 rows
Dataset exported to : countries_of_the_world.csv
| country_name | capital | population | area_sq_km |
|---|---|---|---|
| Andorra | Andorra la Vella | 84000 | 468 |
| United Arab Emirates | Abu Dhabi | 4975593 | 82880 |
| Afghanistan | Kabul | 29121286 | 647500 |
| Antigua and Barbuda | St. John’s | 86754 | 443 |
| Anguilla | The Valley | 13254 | 102 |
| Albania | Tirana | 2986952 | 28748 |
| Armenia | Yerevan | 2968000 | 29800 |
| Angola | Luanda | 13068161 | 1246700 |
| Antarctica | None | 0 | 14000000 |
| Argentina | Buenos Aires | 41343201 | 2766890 |
URL: https://www.scrapethissite.com/pages/forms/
Type: Pagination + Form/Query | Method:rvest+ page loop + GET parameters
base_url_hockey <- "https://www.scrapethissite.com/pages/forms/"
# Fetch page 1 to detect the total number of pages
page1 <- read_html(paste0(base_url_hockey, "?per_page=25"))
page_numbers <- page1 %>%
html_nodes("ul.pagination li a") %>%
html_attr("href") %>%
str_extract("page_num=\\d+") %>%
str_extract("\\d+") %>%
as.integer() %>%
na.omit()
total_pages <- ifelse(length(page_numbers) > 0, max(page_numbers), 1)
all_teams <- list()
# Outer loop — build each page URL dynamically
for (page_num in 1:total_pages) {
url_page <- paste0(base_url_hockey, "?page_num=", page_num, "&per_page=25")
# Wrap in tryCatch so a single failed page does not abort the whole run
page_html <- tryCatch(read_html(url_page), error = function(e) NULL)
if (is.null(page_html)) next
team_rows <- page_html %>% html_nodes("tr.team")
if (length(team_rows) == 0) break
for (row in team_rows) {
get_text <- function(node, sel) {
n <- node %>% html_node(sel)
if (!is.na(n)) html_text(n, trim = TRUE) else NA
}
all_teams[[length(all_teams) + 1]] <- list(
team_name = get_text(row, "td.name"),
year = get_text(row, "td.year"),
wins = get_text(row, "td.wins"),
losses = get_text(row, "td.losses"),
ot_losses = get_text(row, "td.ot-losses"),
win_pct = get_text(row, "td.pct"),
goals_for = get_text(row, "td.gf"),
goals_against = get_text(row, "td.ga")
)
}
Sys.sleep(0.3)
}
df_hockey <- bind_rows(lapply(all_teams, as.data.frame, stringsAsFactors = FALSE))
# Validate form/query feature using a keyword search
query_page <- read_html(paste0(base_url_hockey, "?q=boston&per_page=25"))
query_rows <- query_page %>% html_nodes("tr.team")
write_csv(df_hockey, "hockey_teams.csv")Total pages detected : 24
Total records successfully parsed: 582 rows
Boston query results : 21 teams found
Dataset exported to : hockey_teams.csv
| team_name | year | wins | losses | ot_losses | win_pct | goals_for | goals_against |
|---|---|---|---|---|---|---|---|
| Boston Bruins | 1990 | 44 | 24 | 0.55 | 299 | 264 | |
| Buffalo Sabres | 1990 | 31 | 30 | 0.388 | 292 | 278 | |
| Calgary Flames | 1990 | 46 | 26 | 0.575 | 344 | 263 | |
| Chicago Blackhawks | 1990 | 49 | 23 | 0.613 | 284 | 211 | |
| Detroit Red Wings | 1990 | 34 | 38 | 0.425 | 273 | 298 | |
| Edmonton Oilers | 1990 | 37 | 37 | 0.463 | 272 | 272 | |
| Hartford Whalers | 1990 | 31 | 38 | 0.388 | 238 | 276 | |
| Los Angeles Kings | 1990 | 46 | 24 | 0.575 | 340 | 254 | |
| Minnesota North Stars | 1990 | 27 | 39 | 0.338 | 256 | 266 | |
| Montreal Canadiens | 1990 | 39 | 30 | 0.487 | 273 | 249 |
Once the data is collected, each dataset is evaluated to understand its condition before moving into the cleaning stage.
Structural Overview: Row count, column count, and data types are checked early to catch potential type mismatch issues.
Integrity Check: Missing values and duplicate rows are identified and counted across both datasets.
Quality Issues: At least 3 data problems per website are documented as the basis for the cleaning strategy in the next section.
Both datasets are cleaned programmatically using a combination of loops and IF-ELSE logic.
Text Standardization: str_trim()
and str_to_title() are applied through a loop to clean up
whitespace and fix capitalization across text columns.
Type Conversion: Numeric columns are cast using
as.numeric() inside a single loop over column names, so
there is no need to repeat the same code for each column
manually.
Missing Value Handling: IF-ELSE determines the
imputation method per column type — median for numeric columns,
“Unknown” for character columns. For ot_losses in the
hockey dataset specifically, missing values are filled with 0, following
standard sports statistics convention.
Duplicate Removal: distinct() is
run at the end of the process to make sure no duplicate rows remain in
the final dataset.
# Text standardization — trim whitespace and apply proper case
text_cols_c <- c("country_name", "capital")
for (col in text_cols_c) {
df_countries[[col]] <- str_trim(df_countries[[col]])
df_countries[[col]] <- str_to_title(df_countries[[col]])
}
# Convert columns to numeric type
num_cols_c <- c("population", "area_sq_km")
for (col in num_cols_c) {
if (col %in% names(df_countries)) {
df_countries[[col]] <- suppressWarnings(as.numeric(df_countries[[col]]))
}
}
# Impute missing values using IF-ELSE per column
cleaning_log_c <- data.frame(
column = character(),
action = character(),
details = character(),
stringsAsFactors = FALSE
)
for (col in names(df_countries)) {
n_na <- sum(is.na(df_countries[[col]]))
if (n_na > 0) {
if (is.numeric(df_countries[[col]])) {
# Numeric: use median to preserve distribution shape
med_val <- median(df_countries[[col]], na.rm = TRUE)
df_countries[[col]][is.na(df_countries[[col]])] <- med_val
cleaning_log_c <- rbind(cleaning_log_c, data.frame(
column = col,
action = "Median Imputation",
details = sprintf("%d NA values filled with median = %.2f", n_na, med_val)
))
} else {
# Character: fill with "Unknown" as a safe placeholder
df_countries[[col]][is.na(df_countries[[col]])] <- "Unknown"
cleaning_log_c <- rbind(cleaning_log_c, data.frame(
column = col,
action = "Default Fill",
details = sprintf('%d NA values filled with "Unknown"', n_na)
))
}
} else {
cleaning_log_c <- rbind(cleaning_log_c, data.frame(
column = col,
action = "No Action",
details = "No missing values"
))
}
}
# Remove duplicate rows
before_c <- nrow(df_countries)
df_countries <- df_countries %>% distinct()
after_c <- nrow(df_countries)| country_name | capital | population | area_sq_km |
|---|---|---|---|
| Andorra | Andorra La Vella | 84000 | 468 |
| United Arab Emirates | Abu Dhabi | 4975593 | 82880 |
| Afghanistan | Kabul | 29121286 | 647500 |
| Antigua And Barbuda | St. John’s | 86754 | 443 |
| Anguilla | The Valley | 13254 | 102 |
| Albania | Tirana | 2986952 | 28748 |
| Armenia | Yerevan | 2968000 | 29800 |
| Angola | Luanda | 13068161 | 1246700 |
# Text standardization
text_cols_h <- c("team_name")
for (col in text_cols_h) {
df_hockey[[col]] <- str_trim(df_hockey[[col]])
df_hockey[[col]] <- str_to_title(df_hockey[[col]])
}
# Convert all stat columns to numeric via loop
num_cols_h <- c("year", "wins", "losses", "ot_losses", "win_pct", "goals_for", "goals_against")
for (col in num_cols_h) {
if (col %in% names(df_hockey)) {
df_hockey[[col]] <- str_trim(df_hockey[[col]])
df_hockey[[col]] <- suppressWarnings(as.numeric(df_hockey[[col]]))
}
}
# Impute missing values using IF-ELSE
cleaning_log_h <- data.frame(
column = character(),
action = character(),
details = character(),
stringsAsFactors = FALSE
)
for (col in names(df_hockey)) {
n_na <- sum(is.na(df_hockey[[col]]))
if (n_na > 0) {
if (is.numeric(df_hockey[[col]])) {
# NAs in ot_losses filled with 0 — hockey stats convention for older seasons
df_hockey[[col]][is.na(df_hockey[[col]])] <- 0
cleaning_log_h <- rbind(cleaning_log_h, data.frame(
column = col,
action = "Zero Imputation",
details = sprintf("%d NA values filled with 0 (hockey stats convention)", n_na)
))
} else {
df_hockey[[col]][is.na(df_hockey[[col]])] <- "Unknown"
cleaning_log_h <- rbind(cleaning_log_h, data.frame(
column = col,
action = "Default Fill",
details = sprintf('%d NA values filled with "Unknown"', n_na)
))
}
} else {
cleaning_log_h <- rbind(cleaning_log_h, data.frame(
column = col,
action = "No Action",
details = "No missing values"
))
}
}
# Validity check — wins must not be negative
invalid_wins <- sum(df_hockey$wins < 0, na.rm = TRUE)
if (invalid_wins > 0) {
df_hockey$wins[df_hockey$wins < 0] <- 0
wins_note <- sprintf("%d negative wins values corrected to 0", invalid_wins)
} else {
wins_note <- "All wins values are valid (>= 0)"
}
# Check year range validity
invalid_year <- sum(df_hockey$year < 1900 | df_hockey$year > 2030, na.rm = TRUE)
if (invalid_year > 0) {
year_note <- sprintf("%d rows have year values outside valid range [1900, 2030]", invalid_year)
} else {
year_note <- "All year values are within valid range [1900, 2030]"
}
# Remove duplicate rows
before_h <- nrow(df_hockey)
df_hockey <- df_hockey %>% distinct()
after_h <- nrow(df_hockey)| team_name | year | wins | losses | ot_losses | win_pct | goals_for | goals_against |
|---|---|---|---|---|---|---|---|
| Boston Bruins | 1990 | 44 | 24 | 0 | 0.550 | 299 | 264 |
| Buffalo Sabres | 1990 | 31 | 30 | 0 | 0.388 | 292 | 278 |
| Calgary Flames | 1990 | 46 | 26 | 0 | 0.575 | 344 | 263 |
| Chicago Blackhawks | 1990 | 49 | 23 | 0 | 0.613 | 284 | 211 |
| Detroit Red Wings | 1990 | 34 | 38 | 0 | 0.425 | 273 | 298 |
| Edmonton Oilers | 1990 | 37 | 37 | 0 | 0.463 | 272 | 272 |
| Hartford Whalers | 1990 | 31 | 38 | 0 | 0.388 | 238 | 276 |
| Los Angeles Kings | 1990 | 46 | 24 | 0 | 0.575 | 340 | 254 |
A new column data_status is added to each dataset using
nested IF-ELSE logic, applied row by row through a for loop.
The rules are straightforward:
df_countries$data_status <- NA_character_
for (i in seq_len(nrow(df_countries))) {
row <- df_countries[i, ]
has_name <- !is.na(row$country_name) && nchar(row$country_name) > 0 &&
row$country_name != "Unknown"
has_cap <- !is.na(row$capital) && nchar(row$capital) > 0 &&
row$capital != "Unknown"
has_pop <- !is.na(row$population) && row$population >= 0
has_area <- !is.na(row$area_sq_km) && row$area_sq_km >= 0
if (!has_name || !has_cap) {
df_countries$data_status[i] <- "Incomplete"
} else if (!has_pop || !has_area) {
df_countries$data_status[i] <- "Incomplete"
} else {
df_countries$data_status[i] <- "Complete"
}
}
status_table_c <- as.data.frame(table(df_countries$data_status))
names(status_table_c) <- c("status", "count")| country_name | capital | population | area_sq_km | data_status |
|---|---|---|---|---|
| Andorra | Andorra La Vella | 84000 | 468 | Complete |
| United Arab Emirates | Abu Dhabi | 4975593 | 82880 | Complete |
| Afghanistan | Kabul | 29121286 | 647500 | Complete |
| Antigua And Barbuda | St. John’s | 86754 | 443 | Complete |
| Anguilla | The Valley | 13254 | 102 | Complete |
| Albania | Tirana | 2986952 | 28748 | Complete |
| Armenia | Yerevan | 2968000 | 29800 | Complete |
| Angola | Luanda | 13068161 | 1246700 | Complete |
df_hockey$data_status <- NA_character_
for (i in seq_len(nrow(df_hockey))) {
row <- df_hockey[i, ]
has_name <- !is.na(row$team_name) && nchar(row$team_name) > 0 &&
row$team_name != "Unknown"
has_year <- !is.na(row$year) && row$year > 0
has_wins <- !is.na(row$wins)
has_losses <- !is.na(row$losses)
if (!has_name || !has_year) {
df_hockey$data_status[i] <- "Incomplete"
} else if (!has_wins || !has_losses) {
df_hockey$data_status[i] <- "Incomplete"
} else {
df_hockey$data_status[i] <- "Complete"
}
}
status_table_h <- as.data.frame(table(df_hockey$data_status))
names(status_table_h) <- c("status", "count")| team_name | year | wins | losses | ot_losses | win_pct | goals_for | goals_against | data_status |
|---|---|---|---|---|---|---|---|---|
| Boston Bruins | 1990 | 44 | 24 | 0 | 0.550 | 299 | 264 | Complete |
| Buffalo Sabres | 1990 | 31 | 30 | 0 | 0.388 | 292 | 278 | Complete |
| Calgary Flames | 1990 | 46 | 26 | 0 | 0.575 | 344 | 263 | Complete |
| Chicago Blackhawks | 1990 | 49 | 23 | 0 | 0.613 | 284 | 211 | Complete |
| Detroit Red Wings | 1990 | 34 | 38 | 0 | 0.425 | 273 | 298 | Complete |
| Edmonton Oilers | 1990 | 37 | 37 | 0 | 0.463 | 272 | 272 | Complete |
| Hartford Whalers | 1990 | 31 | 38 | 0 | 0.388 | 238 | 276 | Complete |
| Los Angeles Kings | 1990 | 46 | 24 | 0 | 0.575 | 340 | 254 | Complete |
This section draws insights from the collected and cleaned data, while also reflecting on the scraping process itself.
Easiest website: Countries of the World — one request covers everything, no pagination needed, and the HTML structure is consistent throughout.
Most challenging website: Hockey Teams — data
spans multiple pages, the page count has to be detected dynamically, and
error handling with tryCatch() is necessary to keep the
process stable.
Approach differences: A technical comparison of Static, Pagination, AJAX, and iFrame scraping methods is discussed further in the Short Report section.
Insights & recommendations: At least 3 insights and 2 recommendations are drawn from the collected and cleaned data.
# Insight 1: Countries with the largest population
top5_pop <- df_countries %>%
arrange(desc(population)) %>%
select(country_name, capital, population) %>%
head(5)
# Insight 2: Countries with the largest area
top5_area <- df_countries %>%
arrange(desc(area_sq_km)) %>%
select(country_name, capital, area_sq_km) %>%
head(5)
# Insight 3: Descriptive statistics for population
pop_mean <- round(mean(df_countries$population, na.rm = TRUE))
pop_med <- median(df_countries$population, na.rm = TRUE)
pop_max <- max(df_countries$population, na.rm = TRUE)
pop_min <- min(df_countries$population, na.rm = TRUE)
# Population bracket distribution — passed dynamically to chart3
pop <- df_countries$population[!is.na(df_countries$population)]
bracket_lt1M <- sum(pop < 1e6)
bracket_1_10M <- sum(pop >= 1e6 & pop < 10e6)
bracket_10_50M <- sum(pop >= 10e6 & pop < 50e6)
bracket_50_200M<- sum(pop >= 50e6 & pop < 200e6)
bracket_gt200M <- sum(pop >= 200e6)# Insight 1: Teams with the most wins in a single season
top5_wins <- df_hockey %>%
arrange(desc(wins)) %>%
select(team_name, year, wins, losses) %>%
head(5)
# Insight 2: Average wins per year (top 5 years)
avg_wins_year <- df_hockey %>%
group_by(year) %>%
summarise(avg_wins = round(mean(wins, na.rm = TRUE), 1), .groups = "drop") %>%
arrange(desc(avg_wins)) %>%
head(5)
# Insight 3: Teams that appear in the most seasons
most_seasons <- df_hockey %>%
group_by(team_name) %>%
summarise(total_seasons = n(), .groups = "drop") %>%
arrange(desc(total_seasons)) %>%
head(5)