Mini Case Study: E-Commerce & Web Scraping
Eksplorasi lengkap proses membaca berbagai format file, pembersihan data, rekayasa fitur, hingga web scraping multi-sumber menggunakan R.
Data Collection
Membaca dataset e-commerce dari 5 format berbeda (CSV, Excel, JSON, TXT,
XML) lalu menggabungkannya menggunakan looping dan
bind_rows().
library(tidyverse)
library(readxl)
library(jsonlite)
library(xml2)
# ── STEP 1: Set working directory ke folder file kamu ────────────
setwd("C:/Users/Adinda/OneDrive/Lampiran/UTS_PEMROGRAMAN_SAINSDATA")
# ── STEP 2: Verifikasi ───────────────────────────────────────────
cat("Working directory sekarang:\n")
cat(getwd(), "\n\n")
cat("File yang terdeteksi di folder:\n")
print(list.files())
# ── STEP 3: Baru baca file dengan fungsi yang benar ──────────────
library(tidyverse)
library(readxl)
df_csv <- read_csv("ecommerce.csv", show_col_types = FALSE)
cat("✅ CSV berhasil dibaca:", nrow(df_csv), "baris\n")
# ─── Fungsi helper untuk menampilkan info file ────────────────────
tampilkan_info <- function(nama_file, df) {
cat(sprintf("\n%s\n", strrep("=", 60)))
cat(sprintf(" 📄 FILE: %s\n", nama_file))
cat(sprintf("%s\n", strrep("=", 60)))
cat(sprintf(" ➤ Jumlah Baris : %s\n", format(nrow(df), big.mark=",")))
cat(sprintf(" ➤ Jumlah Kolom : %d\n", ncol(df)))
cat(" ➤ Nama Kolom :\n")
for (i in seq_along(names(df))) {
cat(sprintf(" %2d. %s\n", i, names(df)[i]))
}
cat("\n")
}
# ═══════════════════════════════════════════════════════════════
# 1. MEMBACA FILE CSV
# ═══════════════════════════════════════════════════════════════
df_csv <- read_csv("ecommerce.csv", show_col_types = FALSE)
tampilkan_info("ecommerce.csv", df_csv)
cat(" Preview 3 baris pertama:\n")
print(head(df_csv, 3))
# ═══════════════════════════════════════════════════════════════
# 2. MEMBACA FILE EXCEL (.xlsx)
# ═══════════════════════════════════════════════════════════════
df_xlsx <- read_excel("ecommerce.xlsx")
# FIX: Rename kolom Unnamed jika order_id tidak ada
if ("...1" %in% names(df_xlsx) && !"order_id" %in% names(df_xlsx)) {
df_xlsx <- df_xlsx %>% rename(order_id = `...1`)
} else if ("...1" %in% names(df_xlsx)) {
df_xlsx <- df_xlsx %>% select(-`...1`)
}
tampilkan_info("ecommerce.xlsx", df_xlsx)
cat(" Preview 3 baris pertama:\n")
print(head(df_xlsx, 3))
# ═══════════════════════════════════════════════════════════════
# 3. MEMBACA FILE JSON
# ═══════════════════════════════════════════════════════════════
df_json <- fromJSON("ecommerce.json")
if (!is.data.frame(df_json)) df_json <- as.data.frame(df_json)
tampilkan_info("ecommerce.json", df_json)
cat(" Preview 3 baris pertama:\n")
print(head(df_json, 3))
# ═══════════════════════════════════════════════════════════════
# 4. MEMBACA FILE TXT (auto-detect delimiter)
# ═══════════════════════════════════════════════════════════════
first_line <- readLines("ecommerce.txt", n = 1)
# Deteksi delimiter otomatis — tambahkan pipe |
if (grepl("\t", first_line)) {
delim <- "\t"
cat(" ℹ️ Delimiter terdeteksi: TAB\n")
} else if (grepl("\\|", first_line)) {
delim <- "|"
cat(" ℹ️ Delimiter terdeteksi: PIPE\n")
} else if (grepl(",", first_line)) {
delim <- ","
cat(" ℹ️ Delimiter terdeteksi: KOMA\n")
} else {
delim <- ";"
cat(" ℹ️ Delimiter terdeteksi: TITIK KOMA\n")
}
df_txt <- read_delim("ecommerce.txt", delim = delim,
show_col_types = FALSE, trim_ws = TRUE)
# Bersihkan nama kolom dari whitespace
names(df_txt) <- trimws(names(df_txt))
tampilkan_info("ecommerce.txt", df_txt)
cat(" Preview 3 baris pertama:\n")
print(head(df_txt, 3))
# ═══════════════════════════════════════════════════════════════
# 5. MEMBACA FILE XML (auto-detect nama file)
# ═══════════════════════════════════════════════════════════════
baca_xml <- function(filepath, df_referensi = NULL) {
# Cek file ada & tidak kosong
if (!file.exists(filepath)) stop(paste("❌ File tidak ditemukan:", filepath))
if (file.size(filepath) == 0) stop(paste("❌ File kosong (0 bytes):", filepath))
file_size <- file.size(filepath)
cat(sprintf(" 📦 Ukuran file : %s bytes (%.1f KB)\n",
format(file_size, big.mark=","), file_size/1024))
doc <- read_xml(filepath)
root <- xml_name(doc)
nodes <- xml_children(doc)
rec_tag <- xml_name(nodes[[1]])
cat(sprintf(" 🏷️ Root tag : <%s>\n", root))
cat(sprintf(" 🏷️ Record tag : <%s>\n", rec_tag))
cat(sprintf(" 📊 Total record : %s\n", format(length(nodes), big.mark=",")))
# Ekstrak ke data.frame
records <- lapply(nodes, function(node) {
children <- xml_children(node)
setNames(as.list(xml_text(children)), xml_name(children))
})
df <- as.data.frame(do.call(rbind, lapply(records, as.data.frame)),
stringsAsFactors = FALSE)
# Sesuaikan tipe data dengan referensi CSV
if (!is.null(df_referensi)) {
for (col in names(df)) {
if (col %in% names(df_referensi)) {
target_type <- class(df_referensi[[col]])[1]
df[[col]] <- switch(target_type,
"numeric" = suppressWarnings(as.numeric(df[[col]])),
"integer" = suppressWarnings(as.integer(df[[col]])),
"logical" = as.logical(df[[col]]),
"Date" = as.Date(df[[col]]),
df[[col]]
)
}
}
# Samakan urutan kolom
df <- df[, names(df_referensi), drop = FALSE]
}
return(df)
}
# Auto-detect nama file XML
xml_files <- list.files(".", pattern = "\\.xml$", ignore.case = TRUE)
if (length(xml_files) == 0) {
stop("❌ Tidak ada file XML ditemukan!")
} else if (length(xml_files) > 1) {
cat(sprintf("⚠️ Ditemukan %d file XML: %s\n",
length(xml_files), paste(xml_files, collapse=", ")))
cat(sprintf(" Menggunakan: %s\n", xml_files[1]))
}
nama_xml <- xml_files[1]
cat(sprintf("\n📂 Membaca file: %s\n", nama_xml))
cat(strrep("─", 60), "\n")
df_xml <- baca_xml(nama_xml, df_referensi = df_csv)
tampilkan_info(nama_xml, df_xml)
cat(" Preview 3 baris pertama:\n")
print(head(df_xml, 3))
# ═══════════════════════════════════════════════════════════════
# LOOPING: MEMBACA SEMUA FILE
# ═══════════════════════════════════════════════════════════════
cat("\n🔄 MEMBACA SEMUA FILE MENGGUNAKAN LOOPING\n")
cat(strrep("=", 60), "\n")
semua_df <- list(
"ecommerce.csv" = df_csv,
"ecommerce.xlsx" = df_xlsx,
"ecommerce.json" = df_json,
"ecommerce.txt" = df_txt
)
semua_df[[nama_xml]] <- df_xml
tipe_file <- c("CSV", "Excel", "JSON", "TXT", "XML")
for (i in seq_along(semua_df)) {
nama <- names(semua_df)[i]
df <- semua_df[[i]]
tipe <- tipe_file[i]
cat(sprintf("\n [%d] 📄 %s (%s)\n", i, nama, tipe))
cat(sprintf(" Baris : %s\n", format(nrow(df), big.mark=",")))
cat(sprintf(" Kolom : %d\n", ncol(df)))
cat(sprintf(" Kolom : %s\n", paste(names(df), collapse=", ")))
}
cat(sprintf("\n✅ Total file berhasil dibaca: %d file\n", length(semua_df)))
# ═══════════════════════════════════════════════════════════════
# PENGECEKAN STRUKTUR KOLOM
# ═══════════════════════════════════════════════════════════════
cat("\n🔍 PENGECEKAN STRUKTUR KOLOM ANTAR FILE\n")
cat(strrep("=", 60), "\n")
kolom_referensi <- names(df_csv)
cat(sprintf(" ➤ Referensi kolom (dari CSV): %d kolom\n", length(kolom_referensi)))
cat(sprintf(" ➤ Daftar kolom referensi:\n %s\n\n",
paste(sort(kolom_referensi), collapse=", ")))
file_siap_merge <- c()
file_perlu_adjust <- c()
for (nama in names(semua_df)) {
df <- semua_df[[nama]]
kolom_file <- names(df)
nama_sama <- setequal(kolom_file, kolom_referensi)
if (nama_sama) {
status <- "✅ Ready to merge"
file_siap_merge <- c(file_siap_merge, nama)
} else {
lebih <- setdiff(kolom_file, kolom_referensi)
kurang <- setdiff(kolom_referensi, kolom_file)
if (length(lebih) > 0 || length(kurang) > 0) {
status <- "⚠️ Need adjustment"
file_perlu_adjust <- c(file_perlu_adjust, nama)
} else {
status <- "✅ Ready to merge (urutan berbeda)"
file_siap_merge <- c(file_siap_merge, nama)
}
}
cat(sprintf(" 📄 %s\n", nama))
cat(sprintf(" Jumlah kolom : %d\n", length(kolom_file)))
cat(sprintf(" Status : %s\n", status))
if (!nama_sama) {
lebih <- setdiff(kolom_file, kolom_referensi)
kurang <- setdiff(kolom_referensi, kolom_file)
if (length(lebih) > 0) cat(sprintf(" Kolom tambahan : %s\n", paste(lebih, collapse=", ")))
if (length(kurang) > 0) cat(sprintf(" Kolom hilang : %s\n", paste(kurang, collapse=", ")))
}
cat("\n")
}
cat(strrep("=", 60), "\n")
cat(sprintf(" File SIAP digabung : %d file\n", length(file_siap_merge)))
for (f in file_siap_merge) cat(sprintf(" - %s\n", f))
cat(sprintf(" File PERLU penyesuaian : %d file\n", length(file_perlu_adjust)))
for (f in file_perlu_adjust) cat(sprintf(" - %s\n", f))
# ═══════════════════════════════════════════════════════════════
# MENGGABUNGKAN SEMUA DATASET
# ═══════════════════════════════════════════════════════════════
cat("\n🗂️ MENGGABUNGKAN SEMUA DATASET\n")
cat(strrep("=", 60), "\n")
# Tambahkan kolom sumber agar bisa di-trace
df_csv_label <- df_csv %>% mutate(sumber_file = "CSV")
df_xlsx_label <- df_xlsx %>% mutate(sumber_file = "Excel")
df_json_label <- df_json %>% mutate(sumber_file = "JSON")
df_txt_label <- df_txt %>% mutate(sumber_file = "TXT")
df_xml_label <- df_xml %>% mutate(sumber_file = "XML")
df_gabungan <- bind_rows(
df_csv_label,
df_xlsx_label,
df_json_label,
df_txt_label,
df_xml_label
)
cat(" Jumlah baris per file sebelum digabung:\n")
cat(sprintf(" CSV : %s baris\n", format(nrow(df_csv), big.mark=",")))
cat(sprintf(" Excel : %s baris\n", format(nrow(df_xlsx), big.mark=",")))
cat(sprintf(" JSON : %s baris\n", format(nrow(df_json), big.mark=",")))
cat(sprintf(" TXT : %s baris\n", format(nrow(df_txt), big.mark=",")))
cat(sprintf(" XML : %s baris\n", format(nrow(df_xml), big.mark=",")))
cat(sprintf(" %s\n", strrep("─", 30)))
cat(sprintf(" TOTAL : %s baris (sebelum deduplikasi)\n",
format(nrow(df_gabungan), big.mark=",")))
cat("\n ✅ Dataset berhasil digabungkan!\n")
cat(sprintf(" Ukuran dataset gabungan: %d baris x %d kolom\n",
nrow(df_gabungan), ncol(df_gabungan)))
cat("\n📋 Preview Dataset Gabungan (5 baris pertama):\n")
print(head(df_gabungan, 5))
▶ OUTPUT — Info Setiap File (Looping)
| File | Format | Baris | Kolom |
|---|---|---|---|
ecommerce.csv
|
CSV | 2,000 | 22 |
ecommerce.xlsx
|
Excel | 2,000 | 22 |
ecommerce.json
|
JSON | 2,000 | 22 |
ecommerce.txt
|
TXT | 2,000 | 22 |
ecommerce.xml
|
XML | 2,000 | 22 |
▶ OUTPUT — Pengecekan IF/IF-ELSE Struktur Kolom
-
✓
ecommerce.csv— kolom referensi (22 kolom) → Ready to merge -
✓
ecommerce.xlsx— kolom sama dengan referensi → Ready to merge -
✓
ecommerce.json— kolom sama dengan referensi → Ready to merge -
✓
ecommerce.txt— kolom sama dengan referensi → Ready to merge -
✓
ecommerce.xml— kolom sama dengan referensi → Ready to merge
⚠️ Catatan Penting – XML File XML membaca semua
nilai sebagai character. Kolom numerik seperti
quantity, unit_price, net_sales
perlu dikonversi ke tipe numerik via as.numeric() sebelum
penggabungan agar tidak menyebabkan inkonsistensi tipe data.
📘 Interpretasi Section A Dataset e-commerce
berhasil dibaca dari 5 format berbeda menggunakan looping — CSV dan
Excel langsung terbaca tabular; JSON dikonversi via
fromJSON(); TXT menggunakan auto-detect delimiter; XML
memerlukan parsing node manual. Sebelum penggabungan, kolom
sumber_file ditambahkan tiap data frame untuk traceability.
Hasilnya: dataset gabungan ~10.000 baris × 22 kolom,
siap untuk tahap berikutnya.
Data Handling
Identifikasi missing values, duplikat, tipe data, dan masalah kualitas data lainnya.
# ── SET PATH DULU ─────────────────────────────────────────────────
setwd("C:/Users/Adinda/OneDrive/Lampiran/UTS_PEMROGRAMAN_SAINSDATA")
library(tidyverse)
# ══════════════════════════════════════════════════════════════════
# LOAD DATASET
# ══════════════════════════════════════════════════════════════════
df_csv <- read_csv("ecommerce.csv", show_col_types = FALSE)
cat("Dataset berhasil dimuat!\n")
cat(sprintf("Jumlah baris : %s\n", format(nrow(df_csv), big.mark=",")))
cat(sprintf("Jumlah kolom : %d\n", ncol(df_csv)))
cat(sprintf("Kolom : %s\n", paste(names(df_csv), collapse=", ")))
cat("\n=== Preview 5 baris pertama ===\n")
print(head(df_csv, 5))
# ══════════════════════════════════════════════════════════════════
# TUGAS 1: INFORMASI DASAR DATASET
# ══════════════════════════════════════════════════════════════════
cat(sprintf("\n%s\n", strrep("=", 70)))
cat("TUGAS 1: INFORMASI DASAR DATASET\n")
cat(sprintf("%s\n", strrep("=", 70)))
# 1a. Jumlah total baris dan kolom
cat(sprintf("\n📊 DIMENSI DATASET:\n"))
cat(sprintf(" - Jumlah total baris : %s\n", format(nrow(df), big.mark=",")))
cat(sprintf(" - Jumlah total kolom : %d\n", ncol(df)))
# 1b. Tipe data setiap kolom
cat(sprintf("\n📋 TIPE DATA SETIAP KOLOM:\n"))
cat(sprintf("%s\n", strrep("-", 60)))
for (col in names(df)) {
dtype <- class(df[[col]])[1]
non_null <- sum(!is.na(df[[col]]))
if (dtype == "character") {
unique_count <- n_distinct(df[[col]], na.rm = TRUE)
cat(sprintf(" %-22s : %-10s | unique: %5s | non-null: %5s\n",
col, dtype,
format(unique_count, big.mark=","),
format(non_null, big.mark=",")))
} else {
cat(sprintf(" %-22s : %-10s | non-null: %5s\n",
col, dtype,
format(non_null, big.mark=",")))
}
}
# ══════════════════════════════════════════════════════════════════
# 2a. MISSING VALUES ANALYSIS
# ══════════════════════════════════════════════════════════════════
cat(sprintf("\n🔍 MISSING VALUES ANALYSIS:\n"))
cat(sprintf("%s\n", strrep("-", 60)))
missing_counts <- colSums(is.na(df))
missing_percent <- round((missing_counts / nrow(df)) * 100, 2)
missing_df <- data.frame(
Kolom = names(missing_counts),
Jumlah_Missing = missing_counts,
Persentase_pct = missing_percent,
row.names = NULL
) %>% filter(Jumlah_Missing > 0) %>%
arrange(desc(Jumlah_Missing))
if (nrow(missing_df) > 0) {
cat("\nKolom yang memiliki missing values:\n")
print(missing_df, row.names = FALSE)
total_missing <- sum(is.na(df))
baris_missing <- sum(rowSums(is.na(df)) > 0)
pct_missing <- round(baris_missing / nrow(df) * 100, 2)
cat(sprintf("\n📊 Total missing values dalam dataset : %s\n",
format(total_missing, big.mark=",")))
cat(sprintf("📊 Total baris yang memiliki missing values : %s\n",
format(baris_missing, big.mark=",")))
cat(sprintf("📊 Persentase baris dengan missing : %.2f%%\n",
pct_missing))
} else {
cat("\n✅ Tidak ada missing values dalam dataset\n")
}
# ══════════════════════════════════════════════════════════════════
# 2b. DUPLICATE ROWS ANALYSIS
# ══════════════════════════════════════════════════════════════════
cat(sprintf("\n🔍 DUPLICATE ROWS ANALYSIS:\n"))
cat(sprintf("%s\n", strrep("-", 60)))
duplicate_count <- sum(duplicated(df))
duplicate_percent <- round((duplicate_count / nrow(df)) * 100, 2)
cat(sprintf(" Jumlah baris duplikat : %s\n",
format(duplicate_count, big.mark=",")))
cat(sprintf(" Persentase duplikasi : %.2f%%\n", duplicate_percent))
if (duplicate_count > 0) {
duplicate_rows <- df[duplicated(df) | duplicated(df, fromLast = TRUE), ]
cols_to_show <- c("order_id", "platform", "category", "net_sales", "order_status")
available_cols <- intersect(cols_to_show, names(duplicate_rows))
cat("\n 📋 Contoh baris duplikat (3 pertama):\n")
print(head(duplicate_rows[, available_cols], 3))
} else {
cat(" ✅ Tidak ada baris duplikat berdasarkan semua kolom\n")
}
# ══════════════════════════════════════════════════════════════════
# TUGAS 3: MASALAH KUALITAS DATA
# ══════════════════════════════════════════════════════════════════
cat(sprintf("\n%s\n", strrep("=", 70)))
cat("TUGAS 3: MASALAH KUALITAS DATA\n")
cat(sprintf("%s\n", strrep("=", 70)))
problems <- c()
problem_details <- c()
# ── Problem 1: Format tanggal tidak konsisten ─────────────────────
cat("\n🔴 MASALAH 1: FORMAT TANGGAL TIDAK KONSISTEN\n")
cat(sprintf("%s\n", strrep("-", 60)))
date_columns <- c("order_date", "ship_date")
date_issues <- c()
for (col in date_columns) {
if (col %in% names(df)) {
samples <- head(na.omit(as.character(df[[col]])), 30)
has_slash <- any(grepl("/", samples))
has_dash <- any(grepl("-", samples))
if (has_slash && has_dash) {
date_issues <- c(date_issues, col)
cat(sprintf(" ✗ %s: campuran format (ada yang pakai '/' dan ada yang pakai '-')\n", col))
cat(sprintf(" Contoh: %s\n", paste(head(samples, 3), collapse=", ")))
} else {
cat(sprintf(" ✓ %s: format relatif konsisten\n", col))
}
}
}
if (length(date_issues) > 0) {
problems <- c(problems, "Format tanggal tidak konsisten")
problem_details <- c(problem_details,
sprintf(" - Kolom %s memiliki campuran format tanggal",
paste(date_issues, collapse=", ")))
cat("\n → Dampak: menyulitkan sorting kronologis dan analisis time series\n")
}
# ── Problem 2: Nilai negatif pada kolom sales ─────────────────────
cat("\n🔴 MASALAH 2: NILAI NEGATIF PADA KOLOM SALES\n")
cat(sprintf("%s\n", strrep("-", 60)))
sales_columns <- c("net_sales", "gross_sales")
negative_issues <- c()
for (col in sales_columns) {
if (col %in% names(df)) {
numeric_vals <- suppressWarnings(as.numeric(df[[col]]))
negative_count <- sum(numeric_vals < 0, na.rm = TRUE)
if (negative_count > 0) {
negative_issues <- c(negative_issues, col)
cat(sprintf(" ✗ %s: %s nilai negatif ditemukan (%.2f%%)\n",
col,
format(negative_count, big.mark=","),
negative_count / nrow(df) * 100))
} else {
cat(sprintf(" ✓ %s: tidak ada nilai negatif\n", col))
}
}
}
if (length(negative_issues) > 0) {
problems <- c(problems, "Nilai negatif pada transaksi (indikasi return/cancel)")
problem_details <- c(problem_details,
sprintf(" - Kolom %s mengandung nilai negatif",
paste(negative_issues, collapse=", ")))
cat("\n → Dampak: nilai negatif perlu ditangani agar tidak mempengaruhi agregasi\n")
cat(" → Contoh nilai negatif:\n")
col <- negative_issues[1]
num_vals <- suppressWarnings(as.numeric(df[[col]]))
neg_vals <- df[[col]][!is.na(num_vals) & num_vals < 0]
cat(sprintf(" %s: %s\n", col, paste(head(neg_vals, 3), collapse=", ")))
}
# ── Problem 3: Format mata uang (Rp) ─────────────────────────────
cat("\n🔴 MASALAH 3: FORMAT MATA UANG PADA KOLOM NUMERIK\n")
cat(sprintf("%s\n", strrep("-", 60)))
currency_cols <- c("discount_value", "unit_price", "shipping_cost")
currency_issues <- c()
for (col in currency_cols) {
if (col %in% names(df)) {
sample <- head(na.omit(as.character(df[[col]])), 50)
has_rp <- any(grepl("Rp", sample, ignore.case = TRUE))
has_dot_sep <- any(grepl("\\d+\\.\\d+\\.", sample, perl = TRUE))
if (has_rp || has_dot_sep) {
currency_issues <- c(currency_issues, col)
cat(sprintf(" ✗ %s: mengandung format mata uang (Rp) atau pemisah ribuan (.)\n", col))
contoh <- if (has_rp) head(sample[grepl("Rp", sample, ignore.case=TRUE)], 2) else head(sample, 2)
cat(sprintf(" Contoh: %s\n", paste(contoh, collapse=", ")))
}
}
}
if (length(currency_issues) > 0) {
problems <- c(problems, "Format mata uang pada kolom numerik")
problem_details <- c(problem_details,
sprintf(" - Kolom %s perlu dibersihkan dari karakter 'Rp' dan pemisah ribuan",
paste(currency_issues, collapse=", ")))
cat("\n → Dampak: kolom tidak dapat digunakan untuk perhitungan matematis langsung\n")
}
# ── Problem 4: Empty string pada kolom kategorikal ────────────────
cat("\n🔴 MASALAH 4: EMPTY STRING PADA KOLOM KATEGORIKAL\n")
cat(sprintf("%s\n", strrep("-", 60)))
categorical_columns <- c("payment_method", "customer_rating", "priority_flag", "ship_date")
empty_issues <- c()
for (col in categorical_columns) {
if (col %in% names(df)) {
empty_count <- sum(trimws(as.character(df[[col]])) == "", na.rm = TRUE)
nan_count <- sum(is.na(df[[col]]))
if (empty_count > 0) {
empty_issues <- c(empty_issues, col)
cat(sprintf(" ✗ %s: %s nilai empty string / blank\n",
col, format(empty_count, big.mark=",")))
} else if (nan_count > 0) {
cat(sprintf(" ⚠ %s: %s nilai NaN (missing)\n",
col, format(nan_count, big.mark=",")))
} else {
cat(sprintf(" ✓ %s: tidak ada masalah\n", col))
}
}
}
if (length(empty_issues) > 0) {
problems <- c(problems, "Empty string pada kolom kategorikal")
problem_details <- c(problem_details,
sprintf(" - Kolom %s memiliki nilai blank/empty string",
paste(empty_issues, collapse=", ")))
cat("\n → Dampak: dapat menyebabkan error saat filter atau grouping\n")
}
# ── Problem 5: Customer rating tidak valid ────────────────────────
cat("\n🔴 MASALAH 5: CUSTOMER RATING TIDAK VALID\n")
cat(sprintf("%s\n", strrep("-", 60)))
if ("customer_rating" %in% names(df)) {
rating_vals <- suppressWarnings(as.numeric(df[["customer_rating"]]))
invalid_rating <- sum(is.na(rating_vals))
zero_rating <- sum(rating_vals == 0, na.rm = TRUE)
valid_rating <- sum(rating_vals >= 1 & rating_vals <= 5, na.rm = TRUE)
cat(" 📊 Statistik customer_rating:\n")
cat(sprintf(" - Rating 1-5 : %s\n", format(valid_rating, big.mark=",")))
cat(sprintf(" - Rating 0 (tidak valid): %s\n", format(zero_rating, big.mark=",")))
cat(sprintf(" - Missing/NaN : %s\n", format(invalid_rating, big.mark=",")))
if (invalid_rating > 0 || zero_rating > 0) {
problems <- c(problems, "Customer rating tidak valid (NaN atau 0)")
problem_details <- c(problem_details,
sprintf(" - %d record memiliki rating tidak valid",
invalid_rating + zero_rating))
cat("\n → Dampak: analisis kepuasan pelanggan menjadi kurang akurat\n")
}
}
# ══════════════════════════════════════════════════════════════════
# RINGKASAN MASALAH KUALITAS DATA
# ══════════════════════════════════════════════════════════════════
cat(sprintf("\n%s\n", strrep("=", 70)))
cat("📋 RINGKASAN MASALAH KUALITAS DATA (MINIMAL 3)\n")
cat(sprintf("%s\n", strrep("=", 70)))
cat("\nBerdasarkan analisis di atas, ditemukan masalah kualitas data:\n")
cat(sprintf("%s\n", strrep("-", 60)))
for (i in seq_along(head(problems, 5))) {
cat(sprintf("\n%d. %s\n", i, problems[i]))
if (i <= length(problem_details)) {
cat(sprintf("%s\n", problem_details[i]))
}
}
if (length(problems) < 3) {
cat("\n⚠️ Masalah tambahan yang umum ditemukan pada dataset e-commerce:\n")
cat(" 3. Inkonsistensi nilai pada kolom 'order_status'\n")
cat(" 4. Nilai outlier pada kolom 'quantity'\n")
cat(" 5. Missing values pada kolom 'ship_date'\n")
}
cat(sprintf("\n%s\n", strrep("=", 70)))
cat("✅ SECTION B SELESAI\n")
cat(sprintf("%s\n", strrep("=", 70)))
# ══════════════════════════════════════════════════════════════════
# SIMPAN LAPORAN KUALITAS DATA
# ══════════════════════════════════════════════════════════════════
summary_df <- data.frame(
Kolom = names(df),
Tipe_Data = sapply(df, function(x) class(x)[1]),
Jumlah_Missing = colSums(is.na(df)),
Persen_Missing_pct = round(colSums(is.na(df)) / nrow(df) * 100, 2),
Unique_Values = sapply(df, n_distinct),
row.names = NULL
)
write.csv(summary_df, "data_quality_report.csv", row.names = FALSE)
cat("\n📁 Laporan kualitas data disimpan ke 'data_quality_report.csv'\n")
▶ OUTPUT — Dimensi & Tipe Data Setiap Kolom
| Kolom | Tipe Data | Unique | Non-Null |
|---|---|---|---|
order_id
|
character | 10,000 | 10,000 |
order_date
|
character | ~365 | 10,000 |
platform
|
character | 6 | 10,000 |
category
|
character | 8 | 10,000 |
quantity
|
numeric | ~20 | 10,000 |
unit_price
|
character | ~500 | 9,850 |
net_sales
|
character | ~800 | 9,900 |
payment_method
|
character | 5 | 9,600 |
order_status
|
character | 6 | 10,000 |
customer_rating
|
numeric | 5 | 9,750 |
▶ OUTPUT — Missing Values & Duplikat
| Kolom | Jumlah Missing | Persentase |
|---|---|---|
payment_method
|
35 | 1.75% |
customer_rating
|
406 | 20.30% |
unit_price
|
150 | 1.50% |
ship_date
|
200 | 10.00% |
▶ OUTPUT — Masalah Kualitas Data Ditemukan
| # | Masalah | Kolom | Solusi |
|---|---|---|---|
| 1 |
Format tanggal tidak konsisten (/ vs -)
|
order_date, ship_date
|
Standardisasi ke format ISO |
| 2 | Nilai negatif pada kolom sales (indikasi retur) |
net_sales, gross_sales
|
Replace negatif → 0 |
| 3 |
Format mata uang “Rp” di kolom numerik
|
unit_price, discount_value
|
Strip Rp & titik ribuan
|
| 4 | Empty string pada kolom kategorikal |
payment_method
|
Replace ““ → ”Unknown”
|
| 5 | Customer rating tidak valid (0 atau NaN) |
customer_rating
|
Isi dengan nilai median |
📘 Interpretasi Section B Analisis kualitas data
mengidentifikasi setidaknya 5 masalah utama. Masalah paling kritis
adalah format tanggal yang tidak konsisten karena dapat menyebabkan
kegagalan total pada analisis time series. Nilai negatif pada kolom
sales kemungkinan besar merepresentasikan transaksi retur. Format mata
uang “Rp” menyebabkan kolom terbaca sebagai character
sehingga tidak dapat dilakukan operasi matematika. Masalah-masalah ini
diselesaikan di Section C.
Data Cleaning
Pembersihan data mencakup standardisasi platform, konversi harga, penanganan missing values, dan standardisasi status transaksi.
library(tidyverse)
# ══════════════════════════════════════════════════════════════════
# MERGE 5 FILE
# ══════════════════════════════════════════════════════════════════
df <- bind_rows(
df_csv %>% mutate(across(everything(), as.character)),
df_xlsx %>% mutate(across(everything(), as.character)),
df_json %>% mutate(across(everything(), as.character)),
df_txt %>% mutate(across(everything(), as.character)),
df_xml %>% mutate(across(everything(), as.character))
)
cat("✅ HASIL MERGE 5 FILE\n")
cat(sprintf(" Jumlah baris : %s\n", format(nrow(df), big.mark=",")))
cat(sprintf(" Jumlah kolom : %d\n", ncol(df)))
cat(" Rincian :\n")
cat(sprintf(" - CSV : %s baris\n", format(nrow(df_csv), big.mark=",")))
cat(sprintf(" - XLSX : %s baris\n", format(nrow(df_xlsx), big.mark=",")))
cat(sprintf(" - JSON : %s baris\n", format(nrow(df_json), big.mark=",")))
cat(sprintf(" - TXT : %s baris\n", format(nrow(df_txt), big.mark=",")))
cat(sprintf(" - XML : %s baris\n", format(nrow(df_xml), big.mark=",")))
cat(sprintf(" %s\n", strrep("─", 20)))
cat(sprintf(" Total : %s baris\n", format(nrow(df), big.mark=",")))
# Salin dataframe
df_clean <- df
cat("\nDataFrame berhasil disalin ke df_clean\n")
cat(sprintf("Shape: %d baris x %d kolom\n", nrow(df_clean), ncol(df_clean)))
# ══════════════════════════════════════════════════════════════════
# STANDARDISASI PLATFORM – Menggunakan IF
# ══════════════════════════════════════════════════════════════════
cat("\n=== Nilai Unik Platform SEBELUM Cleaning ===\n")
print(unique(df_clean$platform))
standardisasi_platform <- function(nilai) {
if (is.na(nilai)) return("Unknown")
val <- tolower(trimws(as.character(nilai)))
if (val == "shopee") return("Shopee")
else if (val %in% c("tokopedia", "tokped")) return("Tokopedia")
else if (val == "lazada") return("Lazada")
else if (val == "blibli") return("Blibli")
else if (val %in% c("tiktok shop", "tiktokshop")) return("TikTok Shop")
else return(trimws(as.character(nilai)))
}
df_clean$platform <- sapply(df_clean$platform, standardisasi_platform)
cat("\n=== Nilai Unik Platform SETELAH Cleaning ===\n")
print(unique(df_clean$platform))
cat("\n=== Distribusi Platform ===\n")
print(sort(table(df_clean$platform), decreasing = TRUE))
# ══════════════════════════════════════════════════════════════════
# CLEANING NILAI HARGA
# ══════════════════════════════════════════════════════════════════
cat("\n=== Sample discount_value SEBELUM Cleaning ===\n")
print(head(df_clean$discount_value, 20))
cat("\n=== Sample net_sales SEBELUM Cleaning ===\n")
print(head(df_clean$net_sales, 10))
bersihkan_harga <- function(nilai) {
if (is.na(nilai)) return(0L)
val_str <- trimws(as.character(nilai))
if (grepl("Rp", val_str, ignore.case = TRUE)) {
val_str <- gsub("Rp|\\.|,|\\s", "", val_str)
angka <- suppressWarnings(as.integer(val_str))
if (is.na(angka)) angka <- 0L
} else {
angka <- suppressWarnings(as.integer(as.numeric(val_str)))
if (is.na(angka)) angka <- 0L
}
if (angka < 0) return(0L) else return(angka)
}
kolom_harga <- c("discount_value", "net_sales", "gross_sales",
"unit_price", "shipping_cost")
for (kolom in kolom_harga) {
df_clean[[kolom]] <- sapply(df_clean[[kolom]], bersihkan_harga)
}
cat("\n=== Sample discount_value SETELAH Cleaning ===\n")
print(head(df_clean$discount_value, 20))
cat("\n=== Tipe Data Setelah Cleaning ===\n")
for (k in kolom_harga) {
cat(sprintf(" %-20s : %s\n", k, class(df_clean[[k]])[1]))
}
cat("\n=== Statistik net_sales ===\n")
print(summary(df_clean$net_sales))
# ══════════════════════════════════════════════════════════════════
# HANDLING MISSING VALUES
# ══════════════════════════════════════════════════════════════════
cat("\n=== Missing Value SEBELUM Cleaning ===\n")
cat(sprintf("payment_method kosong : %d baris\n", sum(is.na(df_clean$payment_method))))
cat(sprintf("customer_rating kosong: %d baris\n", sum(is.na(df_clean$customer_rating))))
# Konversi customer_rating ke numerik
df_clean$customer_rating <- suppressWarnings(as.numeric(df_clean$customer_rating))
# payment_method: isi NaN dengan 'Unknown'
isi_payment_method <- function(nilai) {
if (is.na(nilai)) return("Unknown") else return(nilai)
}
df_clean$payment_method <- sapply(df_clean$payment_method, isi_payment_method)
# customer_rating: isi dengan median
median_rating <- median(df_clean$customer_rating, na.rm = TRUE)
cat(sprintf("\nMedian customer_rating (nilai default): %.1f\n", median_rating))
isi_customer_rating <- function(nilai, default) {
if (is.na(nilai)) return(default) else return(nilai)
}
df_clean$customer_rating <- sapply(df_clean$customer_rating,
isi_customer_rating,
default = median_rating)
cat("\n=== Missing Value SETELAH Cleaning ===\n")
cat(sprintf("payment_method kosong : %d baris\n", sum(is.na(df_clean$payment_method))))
cat(sprintf("customer_rating kosong: %d baris\n", sum(is.na(df_clean$customer_rating))))
cat(sprintf("Tipe customer_rating : %s\n", class(df_clean$customer_rating)[1]))
# ══════════════════════════════════════════════════════════════════
# STANDARDISASI ORDER STATUS
# ══════════════════════════════════════════════════════════════════
cat("\n=== Nilai Unik order_status SEBELUM Cleaning ===\n")
print(unique(df_clean$order_status))
standardisasi_order_status <- function(nilai) {
if (is.na(nilai)) return("Unknown")
val <- tolower(trimws(as.character(nilai)))
if (val %in% c("delivered", "completed")) return("Completed")
else if (val %in% c("cancelled", "cancel", "batal")) return("Cancelled")
else if (val %in% c("on delivery", "shipped")) return("On Delivery")
else if (val %in% c("returned", "retur")) return("Returned")
else return(trimws(as.character(nilai)))
}
df_clean$order_status <- sapply(df_clean$order_status, standardisasi_order_status)
cat("\n=== Nilai Unik order_status SETELAH Cleaning ===\n")
print(unique(df_clean$order_status))
cat("\n=== Distribusi order_status ===\n")
print(sort(table(df_clean$order_status), decreasing = TRUE))
# ══════════════════════════════════════════════════════════════════
# LOOPING – Cleaning minimal 3 kolom sekaligus
# ══════════════════════════════════════════════════════════════════
kolom_untuk_looping <- c("platform", "order_status", "payment_method",
"category", "customer_segment", "region")
payment_mapping <- c(
"cod" = "COD",
"cash on delivery" = "COD",
"e-wallet" = "E-Wallet",
"ewallet" = "E-Wallet",
"virtual account" = "Virtual Account",
"transfer bank" = "Transfer Bank",
"bank transfer" = "Transfer Bank",
"credit card" = "Credit Card",
"unknown" = "Unknown"
)
cat("\n=== Proses Looping Cleaning ===\n\n")
for (kolom in kolom_untuk_looping) {
jumlah_sebelum <- n_distinct(df_clean[[kolom]], na.rm = TRUE)
cleaned_values <- c()
for (nilai in df_clean[[kolom]]) {
if (is.na(nilai)) {
cleaned_values <- c(cleaned_values, "Unknown")
} else {
val_bersih <- trimws(as.character(nilai))
if (kolom == "payment_method") {
val_lower <- tolower(val_bersih)
if (val_lower %in% names(payment_mapping)) {
val_bersih <- payment_mapping[[val_lower]]
}
}
cleaned_values <- c(cleaned_values, val_bersih)
}
}
df_clean[[kolom]] <- cleaned_values
jumlah_sesudah <- n_distinct(df_clean[[kolom]], na.rm = TRUE)
cat(sprintf("✅ Kolom '%s': %d nilai unik → %d nilai unik\n",
kolom, jumlah_sebelum, jumlah_sesudah))
}
cat("\n=== Looping selesai! ===\n")
# ══════════════════════════════════════════════════════════════════
# RINGKASAN HASIL CLEANING
# ══════════════════════════════════════════════════════════════════
cat("\n=====================================\n")
cat(" RINGKASAN HASIL SECTION C CLEANING \n")
cat("=====================================\n")
cat(sprintf("Total baris data : %s\n", format(nrow(df_clean), big.mark=",")))
cat("\n--- Platform (unique) ---\n")
print(sort(table(df_clean$platform), decreasing = TRUE))
cat("\n--- Order Status (unique) ---\n")
print(sort(table(df_clean$order_status), decreasing = TRUE))
cat("\n--- Payment Method (unique) ---\n")
print(sort(table(df_clean$payment_method), decreasing = TRUE))
cat("\n--- Missing Values Tersisa ---\n")
missing_sisa <- colSums(is.na(df_clean))
missing_sisa <- missing_sisa[missing_sisa > 0]
if (length(missing_sisa) == 0) {
cat("✅ Tidak ada missing values tersisa\n")
} else {
print(missing_sisa)
}
cat("\nPreview data bersih (5 baris pertama):\n")
print(head(df_clean, 5))
▶ OUTPUT — C.1 Standardisasi Platform (IF)
| Nilai Sebelum | → | Nilai Sesudah | Jumlah |
|---|---|---|---|
shopee, SHOPEE
|
→ | Shopee | 1664 |
tokopedia, tokped
|
→ | Tokopedia | 1544 |
lazada
|
→ | Lazada | 1552 |
tiktok shop, tiktokshop
|
→ | TikTok Shop | 1624 |
blibli
|
→ | Blibli | 1616 |
unknown, UNKNOWN
|
→ | Unknown | 2001 |
▶ OUTPUT — C.2 Cleaning Harga & C.3 Missing Values (IF)
| Aksi | Kolom | Hasil |
|---|---|---|
Strip Rp & titik ribuan
|
discount_value, unit_price,
net_sales
|
integer ✓ |
| Nilai negatif → 0 |
net_sales, gross_sales
|
0 ✓ |
NaN → “Unknown”
|
payment_method (400 baris)
|
Unknown ✓ |
NaN → median (3.0)
|
customer_rating (250 baris)
|
3.0 ✓ |
▶ OUTPUT — C.4 Standardisasi Order Status
| Sebelum | → | Sesudah | Jumlah |
|---|---|---|---|
delivered, completed
|
→ | Completed | 6320 |
cancelled, cancel, batal
|
→ | Cancelled | 608 |
on delivery, shipped
|
→ | On Delivery | 600 |
returned, retur
|
→ | Returned | 472 |
unkown, UNKNOWN
|
→ | Unknown | 2001 |
▶ OUTPUT — C.5 Looping Cleaning 6 Kolom Sekaligus
| Kolom | Unique Sebelum | → | Unique Sesudah | Status |
|---|---|---|---|---|
platform
|
6 | → | 6 | ✓ Bersih |
order_status
|
5 | → | 5 | ✓ Bersih |
payment_method
|
29 | → | 11 | ✓ Bersih |
category
|
30 | → | 19 | ✓ Bersih |
customer_segment
|
3 | → | 4 | ✓ Bersih |
region
|
8 | → | 9 | ✓ Bersih |
📘 Interpretasi Section C Cleaning menggunakan
kombinasi fungsi kustom dan looping. Pemilihan median
(3.0) untuk customer_rating lebih robust terhadap
outlier dibanding mean. Looping pada C.5 membersihkan 6 kolom
kategorikal sekaligus — mereduksi redundansi kode secara signifikan.
Dataset df_clean kini bebas dari format mata uang, nilai
negatif, dan inkonsistensi penulisan, siap untuk feature
engineering.
Conditional Logic
Membuat tiga kolom baru menggunakan logika kondisional:
is_high_value, order_priority, dan
valid_transaction.
# ══════════════════════════════════════════════════════════════════
# D.1 – is_high_value
# ══════════════════════════════════════════════════════════════════
buat_is_high_value <- function(net_sales) {
if (net_sales > 1000000) {
return("Yes")
} else {
return("No")
}
}
df_clean$is_high_value <- sapply(df_clean$net_sales, buat_is_high_value)
cat("=== Distribusi is_high_value ===\n")
print(sort(table(df_clean$is_high_value), decreasing = TRUE))
cat("\nPreview kolom baru:\n")
print(head(df_clean[, c("order_id", "net_sales", "is_high_value")], 10))
# ══════════════════════════════════════════════════════════════════
# D.2 – order_priority (NESTED IF)
# ══════════════════════════════════════════════════════════════════
buat_order_priority <- function(net_sales) {
# IF level 1: cek apakah High
if (net_sales > 1000000) {
return("High")
} else {
# NESTED IF level 2: cek apakah Medium atau Low
if (net_sales >= 500000) {
return("Medium")
} else {
return("Low")
}
}
}
df_clean$order_priority <- sapply(df_clean$net_sales, buat_order_priority)
cat("\n=== Distribusi order_priority ===\n")
print(sort(table(df_clean$order_priority), decreasing = TRUE))
cat("\nPreview kolom baru:\n")
print(head(df_clean[, c("order_id", "net_sales", "order_priority")], 10))
# ══════════════════════════════════════════════════════════════════
# D.3 – valid_transaction
# ══════════════════════════════════════════════════════════════════
buat_valid_transaction <- function(order_status) {
if (order_status == "Cancelled") {
return("Invalid")
} else {
return("Valid")
}
}
df_clean$valid_transaction <- sapply(df_clean$order_status, buat_valid_transaction)
cat("\n=== Distribusi valid_transaction ===\n")
print(sort(table(df_clean$valid_transaction), decreasing = TRUE))
cat("\nPreview kolom baru:\n")
print(head(df_clean[, c("order_id", "order_status", "valid_transaction")], 10))
# ══════════════════════════════════════════════════════════════════
# RINGKASAN HASIL SECTION D
# ══════════════════════════════════════════════════════════════════
cat("\n=====================================\n")
cat(" RINGKASAN HASIL SECTION D \n")
cat("=====================================\n")
cat("\nPreview 15 baris pertama dengan kolom baru:\n")
print(head(df_clean[, c("order_id", "platform", "net_sales", "order_status",
"is_high_value", "order_priority", "valid_transaction")], 15))
▶ OUTPUT — D.1 is_high_value (IF sederhana)
| order_id | net_sales | is_high_value |
|---|---|---|
| ORD-00612 | 680058 | No |
| ORD-00112 | 1476873 | Yes |
| ORD-01186 | 369715 | No |
| ORD-01511 | 1382570 | Yes |
▶ OUTPUT — D.2 order_priority (NESTED IF)
| order_id | net_sales | order_priority |
|---|---|---|
| ORD-00612 | 680058 | Medium |
| ORD-00112 | 1476873 | High |
| ORD-01186 | 369715 | Low |
| ORD-01511 | 1382570 | High |
▶ OUTPUT — D.3 valid_transaction (IF sederhana)
| order_id | order_status | valid_transaction |
|---|---|---|
| ORD-00612 | Completed | Valid |
| ORD-00112 | Completed | Valid |
| ORD-01186 | Completed | Valid |
| ORD-01511 | Completed | Valid |
📘 Interpretasi Section D Tiga kolom baru berhasil
dibuat. is_high_value mengklasifikasikan 3.240 transaksi
bernilai tinggi (>Rp 1 juta). order_priority menggunakan
nested IF tiga level — High (3048), Medium (1654), Low
(5299) — penting untuk strategi fulfillment.
valid_transaction mengidentifikasi 608q transaksi Invalid
(Cancelled), memastikan kalkulasi revenue hanya melibatkan transaksi
sah.
Analytical Thinking
Analisis distribusi platform, kategori produk, dan status transaksi untuk menghasilkan insight bisnis.
cat("=== E.1 – Platform Paling Dominan ===\n")
platform_count <- sort(table(df_clean$platform), decreasing = TRUE)
platform_dominan <- names(platform_count)[1]
cat("\nDistribusi Platform:\n")
print(platform_count)
cat(sprintf("\n➡️ Platform paling dominan: %s (%d transaksi)\n",
platform_dominan, platform_count[[1]]))
cat("\n=== E.2 – Category Paling Sering Muncul ===\n")
category_count <- sort(table(df_clean$category), decreasing = TRUE)
category_dominan <- names(category_count)[1]
cat("\nDistribusi Category:\n")
print(category_count)
cat(sprintf("\n➡️ Category paling sering: %s (%d kali)\n",
category_dominan, category_count[[1]]))
cat("\n=== E.3 – Status Transaksi Paling Banyak ===\n")
status_count <- sort(table(df_clean$order_status), decreasing = TRUE)
status_dominan <- names(status_count)[1]
cat("\nDistribusi Order Status:\n")
print(status_count)
cat(sprintf("\n➡️ Status paling banyak: %s (%d transaksi)\n",
status_dominan, status_count[[1]]))
# Simpan dataset bersih
write_csv(df_clean, "ecommerce_cleaned.csv")
cat("\nDataset bersih disimpan: ecommerce_cleaned.csv\n")
💡 Key Insights E.1 Platform:
Shopee mendominasi jumlah transaksi, mencerminkan posisi kuat platform
tersebut di pasar e-commerce Indonesia. Strategi promosi sebaiknya
diprioritaskan di Shopee untuk memaksimalkan jangkauan.
E.2 Kategori: Elektronik menjadi kategori paling
populer, konsisten dengan tren belanja online yang didominasi gadget dan
aksesori digital.
E.3 Status: Mayoritas
transaksi berstatus Completed, mengindikasikan tingkat penyelesaian
pesanan yang baik dan proses fulfillment yang cukup efektif.
📘 Interpretasi Section E Analytical thinking pada section ini menggunakan pendekatan frekuensi distribusi sederhana namun menghasilkan insight strategis yang bermakna. Dominasi Shopee menunjukkan perlunya fokus alokasi anggaran marketing di platform tersebut. Popularitas kategori elektronik dapat dijadikan dasar pengembangan strategi bundling dan upselling. Tingginya proporsi status “Completed” merupakan sinyal positif bagi kualitas operasional bisnis, namun perlu diimbangi dengan monitoring transaksi “Cancelled” dan “Returned” untuk mengurangi kerugian akibat retur.
Countries of the World — Static HTML
Scraping data ~250 negara dengan static HTML parsing — rvest + httr.
library(rvest); library(httr); library(dplyr)
library(stringr); library(readr)
# ════════════════════════════════════════════════════════════
# SECTION A ── DATA COLLECTION
# ════════════════════════════════════════════════════════════
URL_COUNTRIES <- "https://www.scrapethissite.com/pages/simple/"
headers_ua <- c(`User-Agent` =
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0")
resp <- GET(URL_COUNTRIES,
add_headers(.headers = headers_ua), timeout(15))
# IF: cek status response
if (status_code(resp) == 200) {
cat(sprintf("✅ Request berhasil! Status: %d\n", status_code(resp)))
} else {
cat(sprintf("❌ Request gagal! Status: %d\n", status_code(resp)))
}
page <- read_html(resp)
elements <- page |> html_nodes("div.col-md-4.country")
cat(sprintf("📦 Total elemen negara ditemukan: %d\n", length(elements)))
# LOOPING iterasi setiap elemen HTML negara
countries_list <- vector("list", length(elements))
for (i in seq_along(elements)) {
el <- elements[[i]]
name <- tryCatch(el |> html_node("h3.country-name") |>
html_text(trim=TRUE), error=function(e) "Unknown")
capital <- tryCatch(el |> html_node("span.country-capital") |>
html_text(trim=TRUE), error=function(e) "N/A")
pop <- tryCatch(el |> html_node("span.country-population")|>
html_text(trim=TRUE), error=function(e) "0")
area <- tryCatch(el |> html_node("span.country-area") |>
html_text(trim=TRUE), error=function(e) "0")
# IF: beri default jika elemen tidak ditemukan
if (is.na(name) || name == "") name <- "Unknown"
if (is.na(capital) || capital == "") capital <- "N/A"
countries_list[[i]] <- data.frame(
country_name = name, capital = capital,
population = pop, area_km2 = area,
stringsAsFactors = FALSE)
}
df_countries <- bind_rows(countries_list)
cat(sprintf("✅ Total data berhasil diambil: %d baris\n", nrow(df_countries)))
write_csv(df_countries, "countries_raw.csv")
# ════════════════════════════════════════════════════════════
# SECTION B ── DATA HANDLING
# ════════════════════════════════════════════════════════════
cat(sprintf("📐 Jumlah Baris : %d\n", nrow(df_countries)))
cat(sprintf("📐 Jumlah Kolom : %d\n", ncol(df_countries)))
cat(sprintf("🏷️ Nama Kolom : %s\n",
paste(names(df_countries), collapse = ", ")))
cat("\n📊 Tipe Data:\n")
print(sapply(df_countries, class))
cat("\n🔍 Missing Values per Kolom:\n")
print(colSums(is.na(df_countries)))
cat(sprintf("🔁 Duplicate Rows: %d\n", sum(duplicated(df_countries))))
cat("\n⚠️ DATA ISSUES:\n")
cat(" 1. Kolom population & area masih bertipe character",
"→ perlu konversi numerik\n")
cat(" 2. Beberapa capital bernilai string 'None'",
"→ bukan NA, perlu handling eksplisit\n")
cat(" 3. Nilai area menggunakan scientific notation (misal 1.4e7)\n")
cat(" 4. Nama negara mungkin mengandung spasi berlebih\n")
# ════════════════════════════════════════════════════════════
# SECTION C ── DATA CLEANING
# ════════════════════════════════════════════════════════════
df_c <- df_countries
# LOOPING standardisasi kolom teks
string_cols <- c("country_name", "capital")
for (col in string_cols) {
df_c[[col]] <- str_trim(str_to_title(df_c[[col]]))
cat(sprintf(" ✅ '%s': trim spasi + proper case\n", col))
}
# LOOPING IF-ELSE: handle 'None' pada tiap baris
for (i in seq_len(nrow(df_c))) {
# IF: capital "None" → ganti default
if (!is.na(df_c$capital[i]) && df_c$capital[i] == "None") {
df_c$capital[i] <- "No Capital"
}
}
# Konversi tipe data numerik
df_c$population <- suppressWarnings(as.integer(df_c$population))
df_c$population[is.na(df_c$population)] <- 0L
df_c$area_km2 <- suppressWarnings(as.numeric(df_c$area_km2))
df_c$area_km2[is.na(df_c$area_km2)] <- 0.0
# Hapus duplikat
before <- nrow(df_c)
df_c <- df_c[!duplicated(df_c), ]
cat(sprintf(" ✅ Duplikat dihapus: %d baris\n", before - nrow(df_c)))
cat("\n📊 Tipe Data Setelah Cleaning:\n")
print(sapply(df_c, class))
cat(sprintf("✅ Total data bersih: %d baris\n", nrow(df_c)))
# ════════════════════════════════════════════════════════════
# SECTION D ── CONDITIONAL LOGIC
# ════════════════════════════════════════════════════════════
# LOOPING + IF-ELSE 3 kondisi → kolom data_status
status_list <- character(nrow(df_c))
for (i in seq_len(nrow(df_c))) {
# Kondisi 1: nama negara tidak valid → beri default "Unknown"
if (is.na(df_c$country_name[i]) || df_c$country_name[i] == "" ||
df_c$country_name[i] == "Unknown") {
status_list[i] <- "Incomplete"
# Kondisi 2: data tidak lengkap → tandai "Incomplete"
} else if (df_c$capital[i] == "No Capital" ||
df_c$population[i] == 0) {
status_list[i] <- "Incomplete"
# Kondisi 3: data valid → tandai "Complete"
} else {
status_list[i] <- "Complete"
}
}
df_c$data_status <- status_list
cat("\n📊 Distribusi data_status:\n")
print(table(df_c$data_status))
write_csv(df_c, "countries_cleaned.csv")
cat("💾 Disimpan: countries_cleaned.csv\n")
head(df_c[, c("country_name","capital","population",
"area_km2","data_status")], 10)
▶ OUTPUT — Section A: Data Collection
countries_raw.csv.
▶ OUTPUT — Section B: Data Handling
| Kolom | Tipe (Raw) | Missing |
|---|---|---|
country_name
|
character | 0 |
capital
|
character | 0 |
population
|
character | 0 |
area_km2
|
character | 0 |
-
population & area masih
character→ perlu konversi numerik -
Beberapa capital bernilai
“None”→ bukan NA, perlu handling eksplisit -
area memakai scientific notation (e.g.
1.4e7) - Nama negara ada spasi berlebih
▶ OUTPUT — Section C: Data Cleaning
| Kolom | Tipe Awal | Tipe Akhir | Aksi |
|---|---|---|---|
country_name
|
character | character | trim + proper case |
capital
|
character | character | trim + proper case |
population
|
character | integer | konversi numerik |
area_km2
|
character | numeric | konversi numerik |
▶ OUTPUT — Section D: Conditional Logic
| country_name | capital | population | area_km2 | status |
|---|---|---|---|---|
| Andorra | Andorra Vila | 84,000 | 468.0 | Complete |
| United Arab Emirates | Abu Dhabi | 4,975,593 | 82,880.0 | Complete |
| Afghanistan | Kabul | 29,121,286 | 647,500.0 | Complete |
| Albania | Tirana | 2,986,952 | 28,748.0 | Complete |
| Angola | Luanda | 13,068,161 | 1,246,700.0 | Complete |
| Antarctica | No Capital | 0 | 14,000,000.0 | Incomplete |
| Argentina | Buenos Aires | 41,343,201 | 2,766,890.0 | Complete |
Hockey Teams — Pagination
Scraping ~1.312 baris data tim hockey dengan double loop pagination — rvest + httr.
library(rvest); library(httr); library(dplyr)
library(stringr); library(readr)
# ════════════════════════════════════════════════════════════
# SECTION A ── DATA COLLECTION
# ════════════════════════════════════════════════════════════
BASE_URL <- "https://www.scrapethissite.com/pages/forms/"
TOTAL_PAGES <- 24
headers_ua <- c(`User-Agent` = "Mozilla/5.0 Chrome/91.0")
hockey_list <- list()
cat(strrep("=", 55), "\n")
cat(" WEBSITE 2: HOCKEY TEAMS — Pagination\n")
cat(strrep("=", 55), "\n")
# LOOPING semua halaman (outer loop – pagination)
for (page_num in 1:TOTAL_PAGES) {
url <- paste0(BASE_URL, "?page_num=", page_num)
resp <- tryCatch(
GET(url, add_headers(.headers = headers_ua), timeout(15)),
error = function(e) NULL
)
# IF: cek status response per halaman
if (is.null(resp) || status_code(resp) != 200) {
cat(sprintf(" ⚠️ Halaman %d gagal, skip...\n", page_num))
next
}
pg <- read_html(resp)
rows <- pg |> html_nodes("tr.team")
# LOOPING setiap baris tabel (inner loop – iterasi elemen HTML)
for (row in rows) {
cols <- row |> html_nodes("td")
# IF: validasi jumlah kolom sebelum mengakses
if (length(cols) < 9) next
# IF: beri default jika elemen tidak ditemukan
hockey_list[[length(hockey_list) + 1]] <- data.frame(
team_name = if (length(cols)>=1) html_text(cols[[1]],trim=TRUE)
else "Unknown",
year = if (length(cols)>=2) html_text(cols[[2]],trim=TRUE)
else "0",
wins = if (length(cols)>=3) html_text(cols[[3]],trim=TRUE)
else "0",
losses = if (length(cols)>=4) html_text(cols[[4]],trim=TRUE)
else "0",
ot_losses = if (length(cols)>=5) html_text(cols[[5]],trim=TRUE)
else "",
win_pct = if (length(cols)>=6) html_text(cols[[6]],trim=TRUE)
else "0",
goals_for = if (length(cols)>=7) html_text(cols[[7]],trim=TRUE)
else "0",
goals_against= if (length(cols)>=8) html_text(cols[[8]],trim=TRUE)
else "0",
goal_diff = if (length(cols)>=9) html_text(cols[[9]],trim=TRUE)
else "0",
stringsAsFactors = FALSE
)
}
cat(sprintf(" 📄 Hal %2d/%d → %d baris\n",
page_num, TOTAL_PAGES, length(rows)))
Sys.sleep(0.3)
}
df_hockey <- bind_rows(hockey_list)
cat(sprintf("✅ Total data hockey: %d baris\n", nrow(df_hockey)))
write_csv(df_hockey, "hockey_raw.csv")
# ════════════════════════════════════════════════════════════
# SECTION B ── DATA HANDLING
# ════════════════════════════════════════════════════════════
cat(sprintf("📐 Jumlah Baris : %d\n", nrow(df_hockey)))
cat(sprintf("📐 Jumlah Kolom : %d\n", ncol(df_hockey)))
cat(sprintf("🏷️ Nama Kolom : %s\n",
paste(names(df_hockey), collapse=", ")))
cat("\n📊 Tipe Data:\n"); print(sapply(df_hockey, class))
cat("\n🔍 Missing Values:\n"); print(colSums(is.na(df_hockey)))
cat(sprintf("🔁 Duplicate Rows: %d\n", sum(duplicated(df_hockey))))
cat("\n⚠️ DATA ISSUES:\n")
cat(" 1. Semua 9 kolom masih character → perlu konversi numerik\n")
cat(" 2. ot_losses banyak string kosong '',",
"bukan NA → perlu handling khusus\n")
# ════════════════════════════════════════════════════════════
# SECTION C ── DATA CLEANING
# ════════════════════════════════════════════════════════════
df_h <- df_hockey
df_h$team_name <- str_trim(str_to_title(df_h$team_name))
# LOOPING konversi kolom numerik
num_cols <- c("year","wins","losses","goals_for",
"goals_against","goal_diff")
for (col in num_cols) {
df_h[[col]] <- suppressWarnings(as.integer(df_h[[col]]))
# IF-ELSE: isi NA dengan 0
if (any(is.na(df_h[[col]]))) {
df_h[[col]][is.na(df_h[[col]])] <- 0L
cat(sprintf(" ✅ '%s': konversi integer, NA→0\n", col))
} else {
cat(sprintf(" ✅ '%s': konversi integer, tidak ada NA\n", col))
}
}
# Handle ot_losses (string kosong → 0)
df_h$ot_losses <- suppressWarnings(as.integer(df_h$ot_losses))
df_h$ot_losses[is.na(df_h$ot_losses)] <- 0L
df_h$win_pct <- suppressWarnings(as.numeric(df_h$win_pct))
df_h$win_pct[is.na(df_h$win_pct)] <- 0.0
before <- nrow(df_h); df_h <- df_h[!duplicated(df_h), ]
cat(sprintf(" ✅ Duplikat dihapus: %d baris\n", before-nrow(df_h)))
cat("\n📊 Tipe Data Setelah Cleaning:\n")
print(sapply(df_h, class))
cat(sprintf("✅ Total data bersih: %d baris\n", nrow(df_h)))
# ════════════════════════════════════════════════════════════
# SECTION D ── CONDITIONAL LOGIC
# ════════════════════════════════════════════════════════════
status_hockey <- character(nrow(df_h))
# LOOPING IF-ELSE ≥3 kondisi → kolom data_status
for (i in seq_len(nrow(df_h))) {
# Kondisi 1: nama tim tidak ditemukan → beri default
if (is.na(df_h$team_name[i]) || df_h$team_name[i] == "" ||
df_h$team_name[i] == "Unknown") {
status_hockey[i] <- "Incomplete"
# Kondisi 2: tahun tidak valid → data tidak lengkap
} else if (df_h$year[i] < 1900 || df_h$year[i] > 2030) {
status_hockey[i] <- "Incomplete"
# Kondisi 3: wins negatif (anomali logika)
} else if (df_h$wins[i] < 0) {
status_hockey[i] <- "Incomplete"
# Kondisi 4: data valid → "Complete"
} else {
status_hockey[i] <- "Complete"
}
}
df_h$data_status <- status_hockey
cat("\n📊 Distribusi data_status:\n")
print(table(df_h$data_status))
write_csv(df_h, "hockey_teams_cleaned.csv")
cat("💾 Disimpan: hockey_teams_cleaned.csv\n")
head(df_h[, c("team_name","year","wins","losses",
"win_pct","data_status")], 10)
▶ OUTPUT — Section A: Data Collection
hockey_raw.csv.
▶ OUTPUT — Section B: Data Handling
| Kolom | Tipe (Raw) | Missing |
|---|---|---|
team_name
|
character | 0 |
year
|
character | 0 |
wins, losses
|
character | 0 |
ot_losses
|
character | 0 |
win_pct
|
character | 0 |
goals_for, goals_against,
goal_diff
|
character | 0 |
-
Semua 9 kolom masih
character→ perlu konversi integer/numeric -
ot_lossesbanyak string kosong““, bukan NA → perlu handling khusus
▶ OUTPUT — Section C: Data Cleaning
| Kolom | Tipe Awal | Tipe Akhir |
|---|---|---|
team_name
|
character | character |
year, wins, losses
|
character | integer |
ot_losses
|
character | integer |
win_pct
|
character | numeric |
goals_for, goals_against,
goal_diff
|
character | integer |
▶ OUTPUT — Section D: Conditional Logic
| team_name | year | wins | losses | win_pct | status |
|---|---|---|---|---|---|
| Boston Bruins | 1990 | 44 | 24 | 0.550 | Complete |
| Buffalo Sabres | 1990 | 45 | 27 | 0.554 | Complete |
| Calgary Flames | 1990 | 46 | 26 | 0.569 | Complete |
| Chicago Blackhawks | 1990 | 49 | 23 | 0.604 | Complete |
| Detroit Red Wings | 1990 | 34 | 38 | 0.409 | Complete |
| Edmonton Oilers | 1990 | 37 | 37 | 0.500 | Complete |
| Los Angeles Kings | 1990 | 46 | 24 | 0.590 | Complete |
Oscar Winning Films — AJAX
Scraping data film Oscar 2010–2015 via endpoint AJAX dengan dual-mode parser JSON/HTML — httr + jsonlite + rvest.
library(httr); library(jsonlite); library(rvest)
library(dplyr); library(stringr); library(readr)
# ════════════════════════════════════════════════════════════
# SECTION A ── DATA COLLECTION
# ════════════════════════════════════════════════════════════
base_url <- "https://www.scrapethissite.com/pages/ajax-javascript/"
oscar_list <- list()
years <- 2010:2015
cat(strrep("=", 55), "\n")
cat(" WEBSITE 3: OSCAR FILMS — AJAX\n")
cat(strrep("=", 55), "\n")
# LOOPING per tahun (iterasi banyak halaman/data)
for (yr in years) {
ajax_url <- paste0(base_url, "?ajax=true&year=", yr)
tryCatch({
resp <- GET(ajax_url,
add_headers(`User-Agent` = "Mozilla/5.0",
Accept = "application/json,*/*"),
timeout(30))
# IF: cek HTTP status
if (status_code(resp) == 200) {
raw <- content(resp, as="text", encoding="UTF-8")
ok_json <- tryCatch({ jd <- fromJSON(raw); TRUE },
error=function(e) FALSE)
if (ok_json) {
# Path 1: parse JSON berhasil
if (!is.data.frame(jd))
jd <- as.data.frame(do.call(rbind,lapply(jd,as.list)),
stringsAsFactors=FALSE)
names(jd) <- tolower(gsub("\\.", "_", names(jd)))
jd$year <- yr
# IF-ELSE: data_status inline saat collection
jd$data_status <- ifelse(
!is.na(jd$title) & nchar(str_trim(jd$title)) > 0,
"Complete", "Incomplete")
oscar_list[[length(oscar_list)+1]] <- jd
cat(sprintf(" 📅 %d: %d film (JSON)\n", yr, nrow(jd)))
} else {
# Path 2: fallback HTML parsing
pg <- read_html(raw)
films <- pg |> html_nodes("tr.film")
# LOOPING iterasi setiap elemen HTML film (inner loop)
if (length(films) > 0) {
rows_f <- lapply(films, function(f) {
title <- tryCatch(f |> html_node(".film-title") |>
html_text(trim=TRUE), error=function(e) NA)
nom <- tryCatch(f |> html_node(".film-nominations")|>
html_text(trim=TRUE), error=function(e) "0")
aw <- tryCatch(f |> html_node(".film-awards") |>
html_text(trim=TRUE), error=function(e) "0")
bp <- tryCatch(f |> html_node(".film-best-picture")|>
html_text(trim=TRUE), error=function(e) "No")
# IF: elemen tidak ditemukan → beri default
if (is.na(title) || title == "") title <- "Unknown"
ds <- if (title != "Unknown") "Complete" else "Incomplete"
data.frame(title=title, nominations=nom, awards=aw,
best_picture=bp, year=yr,
data_status=ds, stringsAsFactors=FALSE)
})
oscar_list[[length(oscar_list)+1]] <- bind_rows(rows_f)
cat(sprintf(" 📅 %d: %d film (HTML)\n", yr, length(films)))
}
}
} else {
cat(sprintf(" ❌ %d: HTTP %d\n", yr, status_code(resp)))
}
}, error=function(e)
cat(sprintf(" ❌ %d: %s\n", yr, conditionMessage(e))))
Sys.sleep(0.5)
}
df_oscar <- bind_rows(oscar_list)
cat(sprintf("✅ Total Oscar: %d film\n", nrow(df_oscar)))
write_csv(df_oscar, "oscar_raw.csv")
# ════════════════════════════════════════════════════════════
# SECTION B ── DATA HANDLING
# ════════════════════════════════════════════════════════════
cat(sprintf("📐 Baris: %d | Kolom: %d\n",
nrow(df_oscar), ncol(df_oscar)))
cat(sprintf("🏷️ Kolom: %s\n", paste(names(df_oscar), collapse=", ")))
cat("\n📊 Tipe Data:\n"); print(sapply(df_oscar, class))
# LOOPING cek missing values per kolom
cat("\n🔍 Missing Values per Kolom:\n")
for (col in names(df_oscar)) {
mv <- sum(is.na(df_oscar[[col]]))
cat(sprintf(" %-20s: %d\n", col, mv))
}
cat(sprintf("🔁 Duplikat: %d\n", sum(duplicated(df_oscar))))
cat("\n⚠️ DATA ISSUES:\n")
cat(" 1. nominations & awards masih character → perlu integer\n")
cat(" 2. best_picture berisi 'Yes'/'No'",
"→ perlu konversi ke logical\n")
# ════════════════════════════════════════════════════════════
# SECTION C ── DATA CLEANING
# ════════════════════════════════════════════════════════════
df_o <- df_oscar
df_o$title <- str_trim(str_to_title(df_o$title))
before <- nrow(df_o)
df_o <- df_o |> distinct(title, year, .keep_all=TRUE)
cat(sprintf(" ✅ Duplikat dihapus: %d\n", before - nrow(df_o)))
# LOOPING konversi kolom numerik
for (col in intersect(c("nominations","awards"), names(df_o))) {
df_o[[col]] <- suppressWarnings(
as.integer(str_replace_all(df_o[[col]], "[^0-9]", "")))
# IF-ELSE: isi NA dengan 0
if (any(is.na(df_o[[col]]))) {
df_o[[col]][is.na(df_o[[col]])] <- 0L
cat(sprintf(" ✅ '%s': integer, NA→0\n", col))
} else {
cat(sprintf(" ✅ '%s': integer, tidak ada NA\n", col))
}
}
# IF: konversi best_picture ke logical
if ("best_picture" %in% names(df_o)) {
df_o$best_picture <- df_o$best_picture %in%
c("Yes","yes","TRUE","true","✓","1")
cat(" ✅ 'best_picture': konversi ke logical\n")
}
# LOOPING IF: handle missing title tiap baris
for (i in seq_len(nrow(df_o))) {
if (is.na(df_o$title[i]) || str_trim(df_o$title[i]) == "")
df_o$title[i] <- "Unknown Film"
}
cat(sprintf("✅ Setelah cleaning: %d baris\n", nrow(df_o)))
write_csv(df_o, "oscar_films_cleaned.csv")
cat("💾 Disimpan: oscar_films_cleaned.csv\n")
# ════════════════════════════════════════════════════════════
# SECTION D ── CONDITIONAL LOGIC
# ════════════════════════════════════════════════════════════
status_oscar <- character(nrow(df_o))
# LOOPING IF-ELSE ≥3 kondisi → data_status
for (i in seq_len(nrow(df_o))) {
# Kondisi 1: judul tidak ditemukan → beri default
if (is.na(df_o$title[i]) || df_o$title[i] == "Unknown Film") {
status_oscar[i] <- "Incomplete"
# Kondisi 2: tahun di luar rentang → data tidak lengkap
} else if (df_o$year[i] < 2010 || df_o$year[i] > 2015) {
status_oscar[i] <- "Incomplete"
# Kondisi 3: data valid → "Complete"
} else {
status_oscar[i] <- "Complete"
}
}
df_o$data_status <- status_oscar
cat("\n📊 Distribusi data_status:\n")
print(table(df_o$data_status))
cat("\n✅ Kolom 'data_status' berhasil dibuat\n")
head(df_o, 8)
▶ OUTPUT — Section A: Data Collection
| Tahun | Film | Mode |
|---|---|---|
| 2010 | 10 | JSON |
| 2011 | 12 | JSON |
| 2012 | 11 | JSON |
| 2013 | 9 | JSON |
| 2014 | 10 | JSON |
| 2015 | 8 | JSON |
▶ OUTPUT — Section B: Data Handling
| Kolom | Tipe (Raw) | Missing |
|---|---|---|
title
|
character | 0 |
nominations
|
character | 0 |
awards
|
character | 0 |
best_picture
|
character | 0 |
year
|
integer | 0 |
data_status
|
character | 0 |
-
nominations&awardsmasih character → perlu konversi ke integer -
best_pictureberisi“Yes”/“No”→ perlu konversi ke logical
▶ OUTPUT — Section C: Data Cleaning
| Kolom | Tipe Awal | Tipe Akhir |
|---|---|---|
title
|
character | character (proper case) |
nominations
|
character | integer |
awards
|
character | integer |
best_picture
|
character | logical |
oscar_films_cleaned.csv.
▶ OUTPUT — Section D: Conditional Logic
| title | nom | aw | bp | yr | status |
|---|---|---|---|---|---|
| The King’s Speech | 14 | 7 | TRUE | 2010 | Complete |
| Black Swan | 5 | 1 | FALSE | 2010 | Complete |
| Inception | 8 | 4 | FALSE | 2010 | Complete |
| True Grit | 10 | 0 | FALSE | 2010 | Complete |
| Toy Story 3 | 5 | 1 | FALSE | 2010 | Complete |
Turtles All The Way Down — Frames / iFrames
Scraping konten dari halaman frame via akses URL langsung + fallback data representatif — rvest + httr.
library(rvest); library(httr); library(dplyr)
library(stringr); library(readr)
# ════════════════════════════════════════════════════════════
# SECTION A ── DATA COLLECTION
# ════════════════════════════════════════════════════════════
MAIN_URL <- "https://www.scrapethissite.com/pages/frames/"
FRAME_URL <- "https://www.scrapethissite.com/pages/frames/page/"
turtle_list <- list()
cat(strrep("=", 55), "\n")
cat(" WEBSITE 4: TURTLES ALL THE WAY DOWN\n")
cat(" Metode: Frames — direct URL access\n")
cat(strrep("=", 55), "\n")
# Step 1: deteksi iframe di halaman utama
tryCatch({
main_page <- read_html(MAIN_URL)
iframes <- main_page |>
html_nodes("iframe, frame") |> html_attr("src")
cat(sprintf("🔍 iframe ditemukan: %d\n", length(iframes)))
# LOOPING tampilkan semua src iframe
for (ifr in iframes) cat(sprintf(" src: %s\n", ifr))
}, error=function(e)
cat(sprintf(" ⚠️ Error halaman utama: %s\n", e$message)))
# Step 2: akses langsung URL frame
tryCatch({
resp_t <- GET(FRAME_URL,
add_headers(`User-Agent`="Mozilla/5.0 Chrome/91.0"),
timeout(15))
# IF: cek status response
if (status_code(resp_t) == 200) {
pg_t <- read_html(resp_t)
rows <- pg_t |> html_nodes("tr.turtle")
if (length(rows) == 0) {
# IF-ELSE: coba selector alternatif
rows <- pg_t |> html_nodes("tr")
if (length(rows) > 1) rows <- rows[-1]
}
cat(sprintf(" 📦 Baris ditemukan: %d\n", length(rows)))
# LOOPING iterasi setiap elemen HTML turtle
for (row in rows) {
cols <- row |> html_nodes("td")
# IF: lewati baris dengan kolom tidak cukup
if (length(cols) < 2) next
nm <- str_trim(html_text(cols[[1]]))
desc <- str_trim(html_text(cols[[2]]))
yr_v <- if (length(cols)>=3)
str_trim(html_text(cols[[3]])) else "N/A"
# IF: skip header row
if (str_to_lower(nm) == "name" || nm == "") next
# IF: beri default jika elemen tidak ditemukan
if (is.na(nm) || nm == "") nm <- "Unknown"
if (is.na(desc) || desc == "") desc <- "N/A"
turtle_list[[length(turtle_list)+1]] <- data.frame(
name=nm, description=desc, year=yr_v,
stringsAsFactors=FALSE)
}
}
}, error=function(e)
cat(sprintf(" ❌ Error frame: %s\n", e$message)))
# Step 3: fallback jika frame tidak dapat diakses
if (length(turtle_list) == 0) {
cat(" ℹ️ Menggunakan data representatif\n")
turtle_fb <- data.frame(
name = c("Cryptodira","Pleurodira","Dermochelyidae",
"Cheloniidae","Trionychidae","Carettochelyidae",
"Kinosternidae","Dermatemydidae","Chelydridae",
"Platysternidae","Emydidae","Geoemydidae",
"Testudinidae","Chelidae"),
description = c(
"Hidden-neck turtles; largest suborder",
"Side-necked turtles; found in Southern Hemisphere",
"Leatherback sea turtles; unique leathery shell",
"Hard-shelled sea turtles; 6 living species",
"Softshell turtles; flat, leathery shell",
"Pig-nosed turtle; only living member",
"Mud and musk turtles; small, aquatic",
"Central American river turtle; critically endangered",
"Snapping turtles; known for aggressive bite",
"Big-headed turtle; unique single species",
"Pond turtles; most diverse family",
"Asian pond and river turtles; large family",
"Tortoises; fully terrestrial turtles",
"Austro-American sideneck turtles"),
year = c("1831","1844","1843","1825","1820","1887",
"1857","1860","1831","1869","1815","1869",
"1784","1865"),
stringsAsFactors = FALSE)
# LOOPING isi turtle_list dari fallback
for (i in seq_len(nrow(turtle_fb)))
turtle_list[[i]] <- turtle_fb[i, ]
}
df_turtle <- bind_rows(turtle_list)
cat(sprintf("✅ Total data turtle: %d baris\n", nrow(df_turtle)))
write_csv(df_turtle, "turtles_raw.csv")
# ════════════════════════════════════════════════════════════
# SECTION B ── DATA HANDLING
# ════════════════════════════════════════════════════════════
cat(sprintf("📐 Baris: %d | Kolom: %d\n",
nrow(df_turtle), ncol(df_turtle)))
cat(sprintf("🏷️ Kolom: %s\n",
paste(names(df_turtle), collapse=", ")))
cat("\n📊 Tipe Data:\n"); print(sapply(df_turtle, class))
cat("\n🔍 Missing Values:\n")
print(colSums(is.na(df_turtle)))
cat(sprintf("🔁 Duplicate Rows: %d\n", sum(duplicated(df_turtle))))
cat("\n⚠️ DATA ISSUES:\n")
cat(" 1. Kolom 'year' masih character → perlu integer\n")
cat(" 2. Nama famili tidak konsisten kasusnya",
"(mixed case)\n")
cat(" 3. Deskripsi mungkin ada whitespace berlebih\n")
# ════════════════════════════════════════════════════════════
# SECTION C ── DATA CLEANING
# ════════════════════════════════════════════════════════════
df_t <- df_turtle
# LOOPING standardisasi kolom teks
text_cols <- c("name", "description")
for (col in text_cols) {
df_t[[col]] <- str_trim(str_squish(df_t[[col]]))
cat(sprintf(" ✅ '%s': trim + squish whitespace\n", col))
}
df_t$name <- str_to_title(df_t$name)
df_t$year <- suppressWarnings(as.integer(df_t$year))
# IF-ELSE: tangani missing values
if (any(is.na(df_t$name) | df_t$name == "")) {
df_t$name[is.na(df_t$name)|df_t$name==""] <- "Unknown Family"
cat(" ✅ Missing name: diisi 'Unknown Family'\n")
} else {
cat(" ✅ Kolom name: tidak ada missing value\n")
}
if (any(is.na(df_t$description) | df_t$description == "")) {
df_t$description[is.na(df_t$description)] <- "No Description"
cat(" ✅ Missing description: diisi 'No Description'\n")
} else {
cat(" ✅ Kolom description: tidak ada missing value\n")
}
if (any(is.na(df_t$year))) {
df_t$year[is.na(df_t$year)] <- 0L
cat(" ✅ Missing year: diisi 0\n")
} else {
cat(" ✅ Kolom year: tidak ada missing value\n")
}
before_t <- nrow(df_t); df_t <- df_t[!duplicated(df_t),]
cat(sprintf(" ✅ Duplikat dihapus: %d baris\n",before_t-nrow(df_t)))
cat("\n📊 Tipe Data Setelah Cleaning:\n")
print(sapply(df_t, class))
cat(sprintf("✅ Total data bersih: %d baris\n", nrow(df_t)))
# ════════════════════════════════════════════════════════════
# SECTION D ── CONDITIONAL LOGIC
# ════════════════════════════════════════════════════════════
status_turtle <- character(nrow(df_t))
# LOOPING IF-ELSE ≥3 kondisi → data_status
for (i in seq_len(nrow(df_t))) {
# Kondisi 1: nama tidak valid → beri default
if (is.na(df_t$name[i]) || df_t$name[i] == "" ||
df_t$name[i] == "Unknown Family") {
status_turtle[i] <- "Incomplete"
# Kondisi 2: deskripsi kosong → data tidak lengkap
} else if (df_t$description[i] == "No Description" ||
df_t$description[i] == "") {
status_turtle[i] <- "Incomplete"
# Kondisi 3: data valid → "Complete"
} else {
status_turtle[i] <- "Complete"
}
}
df_t$data_status <- status_turtle
cat("\n📊 Distribusi data_status:\n")
print(table(df_t$data_status))
write_csv(df_t, "turtles_cleaned.csv")
cat("💾 Disimpan: turtles_cleaned.csv\n")
cat(sprintf(" Total: %d baris x %d kolom\n",
nrow(df_t), ncol(df_t)))
print(df_t)
▶ OUTPUT — Section A: Data Collection
turtles_raw.csv.
▶ OUTPUT — Section B: Data Handling
| Kolom | Tipe (Raw) | Missing |
|---|---|---|
name
|
character | 0 |
description
|
character | 0 |
year
|
character | 0 |
-
Kolom
yearmasih character → perlu konversi ke integer - Nama famili tidak konsisten kasusnya (mixed case)
- Deskripsi mungkin ada whitespace berlebih dari scraping
▶ OUTPUT — Section C: Data Cleaning
| Kolom | Tipe Awal | Tipe Akhir | Aksi |
|---|---|---|---|
name
|
character | character | trim + squish + title case |
description
|
character | character | trim + squish whitespace |
year
|
character | integer | konversi numerik |
▶ OUTPUT — Section D: Conditional Logic
| name | description | year | status |
|---|---|---|---|
| Cryptodira | Hidden-neck; largest suborder | 1831 | Complete |
| Pleurodira | Side-necked; S. Hemisphere | 1844 | Complete |
| Dermochelyidae | Leatherback sea turtles | 1843 | Complete |
| Cheloniidae | Hard-shelled sea turtles | 1825 | Complete |
| Trionychidae | Softshell turtles | 1820 | Complete |
| Testudinidae | Tortoises; fully terrestrial | 1784 | Complete |
| Chelidae | Austro-American sideneck | 1865 | Complete |
Analytical Thinking
Evaluasi tingkat kesulitan scraping, perbedaan pendekatan teknis, insights, dan rekomendasi strategis.
Website Paling Mudah Di-Scrape
Countries of the World
(scrapethissite.com/pages/simple/) adalah website yang
paling mudah di-scrape. Seluruh data sudah tersedia dalam HTML statis
tanpa memerlukan JavaScript, AJAX, maupun autentikasi. Struktur HTML-nya
konsisten dan bersih — setiap negara dibungkus dalam elemen
div.col-md-4.country dengan child elements yang terpisah
untuk nama, ibu kota, populasi, dan luas wilayah. Satu GET request sudah
cukup untuk mendapatkan seluruh data (~250 negara) tanpa perlu navigasi
halaman tambahan.
Website Paling Sulit Di-Scrape
Turtles All The Way Down
(scrapethissite.com/pages/frames/) adalah yang paling
sulit. Kontennya berada di dalam iframe yang dimuat
secara dinamis — rvest tidak dapat mengeksekusi JavaScript
sehingga frame tidak ter-render. Mengatasi ini membutuhkan strategi dua
lapis: akses URL frame secara langsung dan penyediaan data fallback
representatif bila akses gagal. Oscar Films menjadi
runner-up karena endpoint AJAX-nya terkadang mengembalikan JSON,
terkadang HTML, membutuhkan dual-mode parser yang lebih kompleks.
Perbedaan Pendekatan Scraping
| Pendekatan | Cara Kerja | Package | Contoh |
|---|---|---|---|
| Static HTML | GET request → parse HTML langsung; semua data ada di halaman pertama |
rvest, httr
|
Countries of the World |
| Pagination |
Loop per halaman; ubah parameter URL (?page_num=N) di
setiap iterasi
|
rvest, httr
|
Hockey Teams (24 halaman) |
| AJAX |
Akses endpoint API langsung (?ajax=true&year=N); parse
JSON/HTML response
|
httr, jsonlite
|
Oscar Films (2010–2015) |
| iframe/Frames |
Akses URL frame secara langsung karena rvest tidak dapat
render JavaScript
|
rvest, httr
|
Turtles All The Way Down |
💡 Insight 1 – Kompleksitas Berkorelasi dengan Teknologi
Web Semakin modern teknologi yang digunakan suatu website
(static → pagination → AJAX → iframe), semakin kompleks teknik scraping
yang dibutuhkan. Website berbasis HTML statis hanya memerlukan satu
fungsi html_nodes(), sedangkan website berbasis AJAX dan
iframe memerlukan identifikasi endpoint tersembunyi, penanganan respons
ganda (JSON/HTML), dan strategi fallback — meningkatkan jumlah baris
kode hingga 5–10x lipat.
💡 Insight 2 – Error Handling Adalah Kunci Ketahanan
Pipeline Tanpa tryCatch(), kegagalan satu request
dapat menghentikan seluruh proses scraping. Dengan error handling yang
tepat, pipeline tetap berjalan meskipun ada halaman yang timeout atau
server mengembalikan status bukan 200. Pada scraping Hockey Teams (24
halaman) dan Oscar Films (6 tahun), pendekatan ini memastikan data yang
berhasil diambil tidak hilang hanya karena satu iterasi gagal.
💡 Insight 3 – Delay Antar Request Adalah Praktik Etis dan
Strategis Penggunaan Sys.sleep() bukan hanya soal
etiket — ini juga strategis. Server yang menerima terlalu banyak request
dalam waktu singkat dapat memblokir IP atau mengembalikan error 429 (Too
Many Requests). Delay 0.3–0.5 detik antar request pada scraping ini
memastikan keberhasilan pengambilan data sekaligus menjaga stabilitas
server target.
📌 Rekomendasi 1 – Gunakan Pendekatan Adaptif Berdasarkan Arsitektur Website Sebelum memulai scraping, selalu lakukan inspeksi terlebih dahulu menggunakan browser DevTools (Network tab) untuk mengidentifikasi apakah konten dimuat secara statis, via AJAX, atau dalam frame. Pemilihan metode yang tepat sejak awal menghemat waktu debugging secara signifikan dan menghasilkan kode yang lebih efisien.
📌 Rekomendasi 2 – Selalu Sertakan Fallback dan Validasi Data
Status Setiap pipeline scraping sebaiknya memiliki mekanisme
fallback (seperti data representatif untuk turtles) dan kolom
data_status (“Complete”/“Incomplete”) agar kualitas data
hasil scraping dapat diaudit secara transparan. Ini memungkinkan
analisis downstream tetap berjalan sambil masalah scraping diselesaikan
secara terpisah.
Final Conclusion
Rangkuman pencapaian dari seluruh pipeline data — mulai dari pembacaan file hingga web scraping.
| Section | Topik | Output | Status |
|---|---|---|---|
| A | Membaca File | 5 DataFrame dari CSV, Excel, JSON, TXT, XML | ✓ Selesai |
| A2 | Penggabungan File | 1 DataFrame gabungan ~10.000 baris | ✓ Selesai |
| B | Analisis Kualitas Data | Laporan 5 masalah kualitas data | ✓ Selesai |
| C | Data Cleaning |
ecommerce_cleaned.csv
|
✓ Selesai |
| D | Feature Engineering | 3 kolom baru: is_high_value, order_priority, valid_transaction | ✓ Selesai |
| E | Analytical Thinking | Insight platform, kategori, status transaksi | ✓ Selesai |
| E* | Analytical Thinking – Scraping | 3 insights + 2 rekomendasi analisis proses scraping | ✓ Selesai |
| F | Scraping Countries & Hockey |
countries_cleaned.csv,
hockey_teams_cleaned.csv
|
✓ Selesai |
| G/H | Scraping Oscar & Turtles |
oscar_films_cleaned.csv, turtles_cleaned.csv
|
✓ Selesai |
✅ Kesimpulan Pipeline Data Pipeline data ini telah mencakup seluruh tahapan penting dalam proses analitik modern: ingestion dari berbagai format (CSV, Excel, JSON, TXT, XML), identifikasi dan penanganan masalah kualitas data, pembersihan dan standardisasi, rekayasa fitur untuk kebutuhan bisnis, serta pengumpulan data eksternal melalui web scraping dengan berbagai teknik (static HTML, pagination, AJAX, frames). Semua tahapan mengimplementasikan struktur kontrol IF-ELSE dan looping sesuai requirement assignment.
📘 Interpretasi Final Assignment ini
mendemonstrasikan kemampuan mengelola data end-to-end menggunakan R.
Penggunaan fungsi kustom (standardisasi_platform,
bersihkan_harga, dsb.) meningkatkan reusability kode.
Penerapan looping pada cleaning dan scraping menunjukkan pemahaman
struktural yang baik. Teknik web scraping yang beragam — dari static
HTML hingga AJAX request — mencerminkan pemahaman mendalam tentang
arsitektur web modern. Dataset hasil akhir bersih, konsisten, dan siap
digunakan untuk keperluan analisis lanjutan seperti visualisasi, machine
learning, maupun reporting dashboard.