# =============================================================================
# TASK 1 | SECTION A — Data Collection
# Goal: Read and combine the data from the various files inside Archive.zip
# =============================================================================
library(readr); library(readxl); library(jsonlite); library(tidyverse)
# ── STEP 1: Locate the ZIP and extract it ────────────────────────────────────
zip_path <- "Archive.zip"
extract_dir <- "extracted_ecommerce"
if (!dir.exists(extract_dir)) dir.create(extract_dir, recursive = TRUE)
tryCatch({
unzip(zip_path, exdir = extract_dir, overwrite = TRUE)
message("Semua file berhasil diekstrak")
}, error = function(e) stop(paste("Error:", e$message)))
# ── STEP 2: List every file in the folder ────────────────────────────────────
all_ec_files <- list.files(extract_dir, full.names = TRUE, recursive = TRUE)
all_ec_files <- all_ec_files[!grepl("__MACOSX|DS_Store", all_ec_files) &
!file.info(all_ec_files)$isdir]
cat("Total file ditemukan:", length(all_ec_files), "\n")
for (f in all_ec_files) cat("-", basename(f), "\n")
# ── STEP 3: Prepare storage ──────────────────────────────────────────────────
dataframes <- list()
col_reference <- NULL
# ── STEP 4: Reader dispatched on file extension ──────────────────────────────
read_file_by_format <- function(file) {
ext <- tolower(tools::file_ext(file))
if (ext == "csv") read_csv(file, show_col_types = FALSE)
else if (ext == "txt") read_delim(file, delim = "|", show_col_types = FALSE)
else if (ext == "xlsx") read_excel(file)
else if (ext == "json") as_tibble(fromJSON(file, flatten = TRUE))
else NULL
}
# ── STEP 5: Read each file &amp; check its column structure ──────────────────────
for (i in seq_along(all_ec_files)) {
file_name <- basename(all_ec_files[i])
cat(sprintf("\nFile ke-%d: %s\n", i, file_name))
df <- tryCatch(read_file_by_format(all_ec_files[i]),
error = function(e) { cat("Error:", e$message, "\n"); NULL })
if (is.null(df)) { cat("Unrecognised format\n"); next }
cat("- Baris:", nrow(df), "| Kolom:", ncol(df), "\n")
cat("- Nama kolom:", paste(names(df), collapse=", "), "\n")
if (is.null(col_reference)) {
col_reference <- names(df); cat("→ Structure taken as the reference\n")
dataframes[[length(dataframes)+1]] <- df
} else {
if (identical(names(df), col_reference)) {
cat("→ Ready to merge\n")
dataframes[[length(dataframes)+1]] <- df
} else {
cat("→ Need adjustment\n")
for (col in col_reference[!col_reference %in% names(df)]) df[[col]] <- NA
dataframes[[length(dataframes)+1]] <- df[, col_reference]
cat("→ Kolom disesuaikan — Ready to merge\n")
}
}
}
# ── STEP 6: Merge and save ───────────────────────────────────────────────────
if (length(dataframes) > 0) {
merged_df <- bind_rows(dataframes)
write_csv(merged_df, "gabungan_seluruh_data.csv")
cat(sprintf("\nSUKSES! Data berhasil digabungkan!\nBaris: %s | Kolom: %d\n",
format(nrow(merged_df), big.mark=","), ncol(merged_df)))
} else {
cat("Tidak ada data yang bisa digabungkan.\n")
}# =============================================================================
# TASK 1 | SECTION C — Data Cleaning
# Goal: Clean the data using programming logic
# =============================================================================
df <- read_csv("gabungan_seluruh_data.csv", show_col_types = FALSE)
df_clean <- df |> distinct() # Drop exact duplicates first
cat("Duplicates removed:", nrow(df) - nrow(df_clean), "rows\n")
# Convert "Rp 100.000"-style strings to numeric (dots are thousands separators)
parse_rupiah <- function(val) {
if (is.na(val)) return(NA_real_)
s <- str_remove_all(str_trim(as.character(val)), regex("Rp\\s*", ignore_case=TRUE))
suppressWarnings(as.numeric(str_remove_all(str_remove_all(s,"\\."),",")))
}
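# Quick sanity check of parse_rupiah() (illustrative values, not from the dataset):
parse_rupiah("Rp 1.250.000")  # -> 1250000
parse_rupiah(" rp 99.900 ")   # -> 99900
parse_rupiah(NA)              # -> NA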
# ── STEP 1: Standardise platform names ───────────────────────────────────────
PLATFORM_MAP <- c("shopee"="Shopee","tokopedia"="Tokopedia","tokped"="Tokopedia",
"lazada"="Lazada","blibli"="Blibli","tiktok shop"="TikTok Shop")
standardize_platform <- function(val) {
if (is.na(val)||str_trim(as.character(val))=="") return("Unknown")
key <- str_trim(tolower(as.character(val)))
if (key %in% names(PLATFORM_MAP)) PLATFORM_MAP[[key]] else str_to_title(str_trim(as.character(val)))
}
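# Illustrative calls (input values assumed for demonstration):
standardize_platform(" SHOPEE ")   # -> "Shopee"    (map hit after trim/lowercase)
standardize_platform("tokped")     # -> "Tokopedia" (alias resolved by the map)
standardize_platform("bukalapak")  # -> "Bukalapak" (not in map: title-case fallback)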
df_clean <- df_clean |> mutate(platform = sapply(platform, standardize_platform))
# ── STEP 2: Clean the price and sales columns (loop) ─────────────────────────
NUMERIC_COLS <- c("unit_price","gross_sales","net_sales","discount_value")
for (col in NUMERIC_COLS[NUMERIC_COLS %in% names(df_clean)]) {
rp_count <- sum(str_detect(as.character(df_clean[[col]]),regex("Rp",ignore_case=TRUE)),na.rm=TRUE)
df_clean[[col]] <- sapply(df_clean[[col]], parse_rupiah)
neg_count <- sum(df_clean[[col]] < 0, na.rm=TRUE)
df_clean[[col]] <- if_else(!is.na(df_clean[[col]]) & df_clean[[col]] < 0, 0, df_clean[[col]])
cat(sprintf("%-20s | Rp diperbaiki: %4d | negatif→0: %4d\n", col, rp_count, neg_count))
}
# ── STEP 3: Handle missing values (IF logic) ─────────────────────────────────
df_clean <- df_clean |>
mutate(
payment_method = if_else(is.na(payment_method)|str_trim(payment_method)=="",
"Unknown", str_trim(payment_method)),
customer_rating = suppressWarnings(as.numeric(customer_rating)),
priority_flag = if_else(is.na(priority_flag)|str_trim(as.character(priority_flag))=="",
"Normal", str_trim(as.character(priority_flag))),
voucher_code = if_else(is.na(voucher_code)|str_trim(as.character(voucher_code))=="",
"NONE", str_trim(as.character(voucher_code))),
discount_pct = suppressWarnings(if_else(is.na(as.numeric(discount_pct)),0,as.numeric(discount_pct)))
) |>
group_by(platform) |>
mutate(customer_rating = if_else(is.na(customer_rating),
median(customer_rating,na.rm=TRUE), customer_rating)) |>
ungroup() |>
# global median as a fallback for platforms whose ratings are all missing
mutate(customer_rating = if_else(is.na(customer_rating),
median(customer_rating,na.rm=TRUE), customer_rating))
# ── STEP 4: Standardise order_status ─────────────────────────────────────────
STATUS_MAP <- c("delivered"="Completed","completed"="Completed",
"cancelled"="Cancelled","cancel"="Cancelled","batal"="Cancelled",
"returned"="Returned","retur"="Returned","shipped"="Shipped",
"on delivery"="On Delivery")
standardize_status <- function(val) {
if (is.na(val)||str_trim(as.character(val))=="") return("Unknown")
key <- str_trim(tolower(as.character(val)))
if (key %in% names(STATUS_MAP)) STATUS_MAP[[key]] else str_to_title(str_trim(as.character(val)))
}
df_clean <- df_clean |> mutate(order_status = sapply(order_status, standardize_status))
# ── STEP 5: Loop-clean five text columns ─────────────────────────────────────
TEXT_COLS <- c("category","product_name","region","customer_segment","stock_status")
for (col in TEXT_COLS[TEXT_COLS %in% names(df_clean)]) {
before <- n_distinct(df_clean[[col]], na.rm=FALSE)
df_clean[[col]] <- str_to_title(str_trim(replace_na(as.character(df_clean[[col]]),"Unknown")))
after <- n_distinct(df_clean[[col]], na.rm=FALSE)
cat(sprintf("%-20s | variasi: %d → %d\n", col, before, after))
}
cat(sprintf("\nDataset bersih: %s baris × %d kolom\n",
format(nrow(df_clean),big.mark=","), ncol(df_clean)))
write_csv(df_clean, "cleaned_ecommerce.csv")# =============================================================================
# TASK 1 | SECTION D — Conditional Logic
# Goal: Apply business rules using if / if-else
# =============================================================================
# ── STEP 1: Flag high-value transactions ─────────────────────────────────────
is_high_value <- function(net_sales) {
  if (is.na(net_sales)) return("No")  # guard: a bare NA would break the comparison
  if (net_sales > 1000000) "Yes" else "No"
}
df_clean <- df_clean |> mutate(is_high_value = sapply(net_sales, is_high_value))
cat("Distribusi is_high_value:\n"); print(table(df_clean$is_high_value))
# ── STEP 2: Order priority category ──────────────────────────────────────────
order_priority <- function(net_sales) {
  if (is.na(net_sales)) return("Low")  # guard against NA
  if (net_sales > 1000000) "High"
  else if (net_sales >= 500000) "Medium"
  else "Low"
}
df_clean <- df_clean |> mutate(order_priority = sapply(net_sales, order_priority))
cat("\nDistribusi order_priority:\n"); print(table(df_clean$order_priority))
# ── STEP 3: Transaction validity ─────────────────────────────────────────────
valid_transaction <- function(status) {
if (status == "Cancelled") "Invalid" else "Valid"
}
df_clean <- df_clean |> mutate(valid_transaction = sapply(order_status, valid_transaction))
cat("\nDistribusi valid_transaction:\n"); print(table(df_clean$valid_transaction))
cat(sprintf("\n3 kolom baru dibuat: is_high_value, order_priority, valid_transaction\n"))
cat(sprintf("Dataset akhir: %s baris × %d kolom\n",
format(nrow(df_clean),big.mark=","), ncol(df_clean)))Archive.zip yang berisi beberapa file
dengan format berbeda. Proses dilakukan dalam 6 langkah terstruktur:
unzip())
list.files())
list() kosong sebagai wadah
dataframe dan col_reference
read_file_by_format(): membaca CSV,
TXT (pipe-sep), XLSX, JSON
bind_rows() gabungkan semua dataframe →
simpan gabungan_seluruh_data.csv
Python → R equivalences:

- zipfile.ZipFile().extractall() → unzip()
- os.listdir() → list.files()
- pd.read_csv() → read_csv() (readr)
- pd.read_excel() → read_excel() (readxl)
- pd.concat(dataframes) → bind_rows(dataframes)
- df.reindex(columns=ref) → reorder the columns and fill NA for any missing ones
Data handling summary:

- Combined dataset : 0 rows × 0 columns
- Source files     : 0 files
- Duplicate rows   : 0 (0.0%)

| Column | Data Type | Non-Null Count |
|---|---|---|
| NA | NA | NA |

No missing values were found in the combined dataset.
Issues identified in the raw data:

- unit_price, gross_sales, net_sales, and discount_value contain rupiah-formatted text such as "Rp 100.000"
- Some net_sales values are negative; business-wise, sales values can never be negative
- "Shopee", "SHOPEE", and " shopee " are the same entity written differently
- "DELIVERED", "Delivered", and "completed" need to be unified
- Missing values occur in customer_rating, priority_flag, voucher_code, and discount_pct
Cleaning approach per issue:

- Platforms: PLATFORM_MAP lookup → title case when the value is not in the map
- Prices: strip "Rp" and the thousands dots → as.numeric() → negative values replaced with 0
- Missing values via if_else(): payment → "Unknown", customer_rating → per-platform median, priority → "Normal", voucher → "NONE", discount_pct → 0
- Statuses: STATUS_MAP lookup → title case
- Text columns: replace_na("Unknown") → str_trim() → str_to_title()
Conditional logic in Section D:

- is_high_value: if net_sales > 1000000 → "Yes", otherwise "No"
- order_priority, equivalent to a case_when(): > 1 million → "High" | ≥ 500 thousand → "Medium" | else → "Low" (see the vectorised sketch below)
- valid_transaction: if order_status == "Cancelled" → "Invalid", otherwise "Valid"
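The same three labels can be produced without the per-element sapply() loop by vectorising the rules; a minimal sketch, assuming df_clean from the cleaning step and writing to a hypothetical df_flags:

# Vectorised alternative to the scalar helper functions (same thresholds)
df_flags <- df_clean |>
  mutate(
    is_high_value     = if_else(net_sales > 1000000, "Yes", "No", missing = "No"),
    order_priority    = case_when(
      net_sales > 1000000 ~ "High",
      net_sales >= 500000 ~ "Medium",
      TRUE                ~ "Low"
    ),
    valid_transaction = if_else(order_status == "Cancelled", "Invalid", "Valid")
  )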
The most dominant platform is —, as it has the highest order count. The majority of customer transactions take place through —.
The category with the most transactions is — — regarded as the highest-demand product in the dataset.
The most frequent status is —. If it is Completed, that is a positive signal that the majority of transactions complete successfully and the order process runs well.
# =============================================================================
# TASK 1 | WEB 1 | SECTION A — Static Scraping
# URL: https://www.scrapethissite.com/pages/simple/
# =============================================================================
library(rvest); library(httr); library(tidyverse)
url_countries <- "https://www.scrapethissite.com/pages/simple/"
# Send the request
res_c <- GET(url_countries, timeout(30), user_agent("Mozilla/5.0"))
soup_c <- read_html(content(res_c, as = "text", encoding = "UTF-8"))
# Grab every country element (equivalent to soup.select("div.country"))
countries_nodes <- soup_c %>% html_nodes("div.country")
# Loop the extraction over each country element
data_negara <- map_df(countries_nodes, function(node) {
tibble(
country_name = node %>% html_node("h3.country-name") %>% html_text(trim=TRUE),
capital = node %>% html_node("span.country-capital") %>% html_text(trim=TRUE),
population = node %>% html_node("span.country-population") %>%
html_text(trim=TRUE) %>% as.numeric(),
area_km2 = node %>% html_node("span.country-area") %>%
html_text(trim=TRUE) %>% as.numeric()
)
})
cat("Jumlah data yang diambil:", nrow(data_negara), "\n")
write_csv(data_negara, "countries_of_world.csv")
cat("Data berhasil disimpan ke file countries_of_world.csv\n")# =============================================================================
# TASK 1 | WEB 1 | SECTION B — Data Handling Report
# =============================================================================
df_raw <- data_negara
cat("SECTION B - DATA HANDLING\n", strrep("=", 50), "\n")
cat("Jumlah baris:", nrow(df_raw), "\n")
cat("Jumlah kolom:", ncol(df_raw), "\n")
cat("Nama kolom :", paste(names(df_raw), collapse=", "), "\n")
cat("Tipe data :\n"); print(sapply(df_raw, class))
# Missing values
cat("\nMissing values standar:\n")
print(colSums(is.na(df_raw)))
# Text placeholders that signal missing data
missing_placeholders <- c("","None","N/A","NA","null","-")
for (col in names(df_raw)) {
if (is.character(df_raw[[col]])) {
n <- sum(df_raw[[col]] %in% missing_placeholders, na.rm=TRUE)
cat(col, ":", n, "placeholder\n")
} else cat(col, ": bukan kolom teks\n")
}
# Duplicates
cat("\nDuplicate rows:", sum(duplicated(df_raw)), "\n")
# =============================================================================
# TASK 1 | WEB 1 | SECTION C — Data Cleaning
# =============================================================================
cat("\nSECTION C - DATA CLEANING\n", strrep("=", 50), "\n")
df_clean <- df_raw
# 1. Standardise text (loop)
text_columns <- names(df_clean)[sapply(df_clean, is.character)]
for (col in text_columns) {
df_clean[[col]] <- str_trim(str_to_title(as.character(df_clean[[col]])))
cat("Membersihkan kolom teks:", col, "\n")
}
# 2. Validate negative values
for (idx in seq_len(nrow(df_clean))) {
if (!is.na(df_clean$population[idx]) && df_clean$population[idx] < 0) {
df_clean$population[idx] <- NA
cat("Baris", idx, ": population negatif → NA\n")
}
if (!is.na(df_clean$area_km2[idx]) && df_clean$area_km2[idx] < 0) {
df_clean$area_km2[idx] <- NA
cat("Baris", idx, ": area_km2 negatif → NA\n")
}
}
# 3. Handle missing values with a loop + if
for (col in names(df_clean)) {
n_missing <- sum(is.na(df_clean[[col]]))
if (n_missing > 0) {
if (col == "capital") {
df_clean[[col]] <- replace_na(df_clean[[col]], "No Capital / Not Available")
cat("capital: diisi 'No Capital / Not Available'\n")
} else if (col %in% c("population","area_km2")) {
df_clean <- df_clean %>% drop_na(all_of(col))
cat(col, ": baris dengan NA dihapus\n")
} else {
df_clean[[col]] <- replace_na(df_clean[[col]], "Unknown")
}
}
}
# 4. Drop duplicates
df_clean <- distinct(df_clean)
# =============================================================================
# TASK 1 | WEB 1 | SECTION D — Conditional Logic
# =============================================================================
cat("\nSECTION D - CONDITIONAL LOGIC\n", strrep("=", 50), "\n")
get_data_status <- function(row) {
required <- c("country_name","capital","population","area_km2")
for (col in required) {
val <- row[[col]]
if (is.na(val) || (is.character(val) &&
(str_trim(val) == "" || val == "No Capital / Not Available")))
return("Incomplete")
}
return("Complete")
}
# apply() would coerce each whole row to character (a numeric NA becomes the
# string "NA"), so iterate over the required columns with pmap instead
df_clean$data_status <- purrr::pmap_chr(
  df_clean[c("country_name","capital","population","area_km2")],
  function(...) get_data_status(list(...))
)
cat("Column 'data_status' created.\n")
cat("Distribution of data_status:\n")
print(table(df_clean$data_status))
write_csv(df_clean, "countries_of_world_clean.csv")
cat("\nData clean tersimpan ke countries_of_world_clean.csv\n")Website scrapethissite.com/pages/simple/ adalah contoh
static scraping paling bersih. Seluruh data negara sudah tersedia di
HTML saat pertama kali di-load — tidak ada JavaScript, tidak ada
pagination, tidak ada autentikasi.
- A single GET request is enough to retrieve the entire dataset
- html_nodes("div.country") collects every country card at once
- map_df() loops the per-card extraction functionally

This is the easiest website to scrape in the whole project.
Python ↔ R equivalences:

- soup.select("div.country") is equivalent to html_nodes("div.country")
- select_one("h3.country-name").get_text(strip=True) is equivalent to html_node("h3.country-name") %>% html_text(trim=TRUE)
- pd.to_numeric(..., errors="coerce") is equivalent to suppressWarnings(as.numeric(...)) (illustrated below)
- df.to_csv(...) is equivalent to write_csv(...)
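A quick illustration of the coercion pairing, with toy values:

# suppressWarnings(as.numeric()) mirrors pd.to_numeric(errors = "coerce"):
# unparseable strings become NA instead of raising an error
suppressWarnings(as.numeric(c("312000000", "n/a", "17.8")))
# -> 312000000 NA 17.8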
# =============================================================================
# TASK 2 | SECTION A — Static Scraping with Pagination
# URL: https://www.scrapethissite.com/pages/forms/
# =============================================================================
library(rvest); library(httr); library(tidyverse)
base_url_hockey <- "https://www.scrapethissite.com/pages/forms/"
extract_hockey_page <- function(page_html) {
rows <- page_html %>% html_nodes("tr.team")
if (length(rows) == 0) return(data.frame())
map_df(rows, function(row) {
nm <- row %>% html_node(".name") %>% html_text(trim = TRUE)
pct <- row %>% html_node(".pct") %>% html_text(trim = TRUE)
# Section D — Conditional Logic
status <- if (is.na(nm) || nm == "" || is.na(pct) || pct == "") "Incomplete" else "Complete"
data.frame(
team_name = ifelse(is.na(nm) || nm == "", "Unknown", nm),
year = row %>% html_node(".year") %>% html_text(trim=TRUE) %>% as.integer(),
wins = row %>% html_node(".wins") %>% html_text(trim=TRUE) %>% as.integer(),
losses = row %>% html_node(".losses") %>% html_text(trim=TRUE) %>% as.integer(),
win_pct = as.numeric(pct),
data_status = status,
scraped_at = format(Sys.time(), "%Y-%m-%d %H:%M:%S"),
stringsAsFactors = FALSE
)
})
}
# Loop over the 24 pages with an adaptive stop
all_hockey <- list()
for (p in 1:24) {
resp <- tryCatch(
GET(paste0(base_url_hockey, "?page_num=", p), timeout(30),
user_agent("Mozilla/5.0 (R Academic Bot)")),
error = function(e) NULL
)
if (!is.null(resp) && status_code(resp) == 200) {
pg <- extract_hockey_page(read_html(resp))
if (nrow(pg) == 0) break
all_hockey[[p]] <- pg
message("Halaman ", p, " berhasil: ", nrow(pg), " baris")
}
Sys.sleep(0.8)
}
hockey_df <- bind_rows(all_hockey)
write_csv(hockey_df, "hockey_teams_data.csv")
message("Saved: ", nrow(hockey_df), " baris")Data hockey tersebar di 24 halaman melalui parameter
?page_num=N. Strategi yang digunakan adalah loop adaptif —
lebih robust daripada hardcode jumlah halaman:
- tryCatch() — isolates connection errors per page
- if (nrow(pg) == 0) break — stops automatically when a page comes back empty
- bind_rows(all_hockey) — merges every page into a single tibble
- Sys.sleep(0.8) — scraping etiquette and IP-ban prevention
# =============================================================================
# TASK 2 | SECTION A — AJAX Scraping Oscar Winners 2010-2015
# URL: https://www.scrapethissite.com/pages/ajax-javascript/
# Error fix: add AJAX headers + is.data.frame() validation + a safe fallback
# =============================================================================
library(httr); library(jsonlite); library(tidyverse)
api_base_oscar <- "https://www.scrapethissite.com/pages/ajax-javascript/?ajax=true&year="
clean_oscar <- function(df_raw, yr) {
df_raw %>%
mutate(
year_event = as.integer(yr),
data_status = case_when(
is.na(title) ~ "Incomplete",
is.na(nominations) ~ "Incomplete",
nominations == 0 ~ "Check Required",
TRUE ~ "Complete"
),
title = if_else(is.na(title), "Untitled", title)
)
}
all_oscar <- list()
for (yr in 2010:2015) {
res <- tryCatch(
GET(paste0(api_base_oscar, yr), timeout(30),
user_agent("Mozilla/5.0 (R Academic Bot)"),
add_headers(
Accept = "application/json, text/javascript, */*; q=0.01",
`X-Requested-With` = "XMLHttpRequest", # Header kritis agar server merespons JSON
Referer = "https://www.scrapethissite.com/pages/ajax-javascript/"
)),
error = function(e) NULL
)
if (!is.null(res) && status_code(res) == 200) {
raw <- tryCatch(content(res, as="text", encoding="UTF-8"), error=function(e) NULL)
if (!is.null(raw) && nchar(trimws(raw)) > 2) {
json <- tryCatch(fromJSON(raw, simplifyDataFrame=TRUE), error=function(e) NULL)
if (!is.null(json) && is.data.frame(json) && nrow(json) > 0) # mandatory validation
all_oscar[[as.character(yr)]] <- clean_oscar(json, yr)
}
}
Sys.sleep(0.8)
}
# Merge, with a safe empty fallback
oscar_df <- if (length(all_oscar) > 0) bind_rows(all_oscar) else {
tibble(title=character(), year_event=integer(), nominations=integer(),
best_picture=logical(), data_status=character())
}
write_csv(oscar_df, "oscar_winners_data.csv")

The Oscar Winners page loads its films after the main HTML has rendered. The content is not in the HTML source — JavaScript injects it asynchronously from a separate endpoint.
How the endpoint is found: ?ajax=true&amp;year={YYYY}
The error occurred because the server does not always respond with valid JSON when the request lacks the proper AJAX headers. It may answer with an HTML error page or empty text, causing fromJSON() to fail and bind_rows() to receive an object that is not a data.frame.
The fixes, demonstrated in a one-off check below:

- send X-Requested-With: XMLHttpRequest and Accept: application/json, plus a matching Referer
- validate is.data.frame(json) and nrow(json) > 0 before calling bind_rows()
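A minimal manual probe of the endpoint for a single year (same header names as in the loop above):

library(httr); library(jsonlite)
# One-off check: does the AJAX endpoint return parseable JSON for 2012?
res <- GET("https://www.scrapethissite.com/pages/ajax-javascript/?ajax=true&year=2012",
           add_headers(`X-Requested-With` = "XMLHttpRequest",
                       Accept = "application/json"))
str(fromJSON(content(res, as = "text", encoding = "UTF-8")))
# expected: a data.frame of films for that year if the headers were accepted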
# =============================================================================
# TASK 1 | WEB 2 | SECTION A — iFrame Scraping
# URL: https://www.scrapethissite.com/pages/frames/
# =============================================================================
library(rvest); library(httr); library(tidyverse)
# STEP 1: Access the main page
url_utama <- "https://www.scrapethissite.com/pages/frames/"
res <- GET(url_utama, timeout(30), user_agent("Mozilla/5.0"))
soup <- read_html(content(res, as = "text", encoding = "UTF-8"))
# STEP 2: Navigate the iFrame — find and extract the iframe URL
# Equivalent: soup.find("iframe", src=lambda x: x and "frame=i" in x)
iframe_node <- soup %>% html_node(xpath = "//iframe[contains(@src, 'frame=i')]")
iframe_src <- html_attr(iframe_node, "src")
iframe_url <- paste0("https://www.scrapethissite.com", iframe_src)
print(paste("Berhasil menemukan URL iFrame:", iframe_url))
# STEP 3: Masuk ke dalam iFrame — request langsung ke URL iframe
res_iframe <- GET(iframe_url, timeout(30), user_agent("Mozilla/5.0"))
soup_iframe <- read_html(content(res_iframe, as = "text", encoding = "UTF-8"))# =============================================================================
# TASK 1 | WEB 2 | SECTION B — Extraction + Deep Scraping
# =============================================================================
# Collect every turtle family card
turtle_cards <- soup_iframe %>% html_nodes("div.turtle-family-card")
raw_data <- list()
for (i in seq_along(turtle_cards)) {
card <- turtle_cards[[i]]
# Extract the name (html_text() on a missing node yields NA, not NULL)
nm_val <- card %>% html_node("h3") %>% html_text(trim = TRUE)
if (is.na(nm_val)) nm_val <- "Unknown"
# Extract the link (html_attr() on a missing node yields NA)
href <- card %>% html_node("a") %>% html_attr("href")
link_val <- if (is.na(href)) {
  "No link"
} else if (startsWith(href, "/")) {
  paste0("https://www.scrapethissite.com", href)
} else href
# Deep scraping: fetch the description from each detail page
desc_val <- "Unknown"
if (link_val != "No link") {
  det <- tryCatch(GET(link_val, timeout(10), user_agent("Mozilla/5.0")),
                  error = function(e) NULL)
  if (!is.null(det) && status_code(det) == 200) {
    # first paragraph as description; html_text() on a missing node yields NA
    txt <- read_html(content(det, as="text", encoding="UTF-8")) %>%
      html_node("p") %>% html_text(trim=TRUE)
    if (!is.na(txt) && nchar(txt) > 0) desc_val <- txt
  } else desc_val <- "Error fetching"
}
}
raw_data[[i]] <- tibble(Name=nm_val, Description=desc_val, Additional_Info=link_val)
Sys.sleep(0.5)
}
df_turtles_raw <- bind_rows(raw_data)
cat("Selesai mengumpulkan", nrow(df_turtles_raw), "data kura-kura.\n")
# DATA HANDLING REPORT
cat("--- DATA HANDLING REPORT ---\n")
cat("Jumlah Baris & Kolom:", paste(dim(df_turtles_raw), collapse=" x "), "\n")
cat("Nama Kolom:", paste(names(df_turtles_raw), collapse=", "), "\n")
cat("Tipe Data:\n"); print(sapply(df_turtles_raw, class))
# Missing values (placeholder)
missing_vals <- sapply(names(df_turtles_raw), function(col)
sum(df_turtles_raw[[col]] %in% c("Unknown","No link","No description available",
"No additional info","Error fetching")))
cat("Missing Values (placeholder):\n"); print(missing_vals)
cat("Jumlah duplicate:", sum(duplicated(df_turtles_raw)), "\n")Sebuah iframe menyematkan dokumen HTML yang sepenuhnya
terpisah di dalam halaman utama. Data kura-kura tidak berada di HTML
halaman utama — ia ada di dokumen terpisah yang di-load ke dalam bingkai
tersebut.
- Locate the iframe element and extract its src attribute using XPath

Once the iframe URL is found, its content can be scraped like any ordinary page — no browser automation is needed as long as the iframe does not use additional JavaScript.
The cards on the iframe page contain only a name and a link. Each turtle's full description lives on its own detail page, so for every card the program makes one more request to that page:

- tryCatch() — isolates connection errors per card so the loop does not crash
- Sys.sleep(0.5) — scraping etiquette between requests
- html_node("p") — takes the first paragraph as the description

This is the hardest website in the project because it requires multiple layered HTTP requests.
# =============================================================================
# TASK 1 | WEB 2 | SECTION C & D — Cleaning + Conditional Labeling
# =============================================================================
# Raw data summary
cat("--- DATA HANDLING REPORT (Summary) ---\n")

--- DATA HANDLING REPORT (Summary) ---
Rows x Columns: 14 x 3
missing_placeholder <- sapply(names(df_turtles_raw), function(col)
sum(df_turtles_raw[[col]] %in% c("Unknown","No link","Error fetching")))
cat("Missing (placeholder):\n"); print(missing_placeholder)Missing (placeholder):
Name Description Additional_Info
0 0 0
Duplikat: 0
Cleaning dan labeling selesai.
Distribusi data_status:
Complete
14
File 'turtles_scraped_final.csv' telah siap dikumpulkan.
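The chunk that actually produces df_turtles and the output above is not shown; a minimal sketch of the cleaning and labeling it performs, assuming the placeholder strings used earlier, could be:

# Hypothetical reconstruction: trim text, drop duplicates, then label each row
df_turtles <- df_turtles_raw %>%
  mutate(across(everything(), str_trim)) %>%
  distinct() %>%
  mutate(data_status = if_else(
    Name == "Unknown" |
      Description %in% c("Unknown", "Error fetching") |
      Additional_Info == "No link",
    "Incomplete", "Complete"
  ))
write_csv(df_turtles, "turtles_scraped_final.csv")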
# Preview (kable/kableExtra for rendering)
library(knitr); library(kableExtra)
df_turtles %>%
  select(Name, Description, data_status) %>%
  head(5) %>%
  mutate(Description = str_trunc(Description, 70)) %>%
  kable(caption = "First 5 Rows of the Turtles Data (After Cleaning)") %>%
  kable_styling(bootstrap_options = c("striped","hover"), font_size = 14) %>%
  column_spec(1, bold = TRUE, color = "#0D3B66") %>%
  column_spec(3, bold = TRUE)

| Name | Description | data_status |
|---|---|---|
| Carettochelyidae | The Carettochelyidae family of turtles — more commonly known as “Pi… | Complete |
| Cheloniidae | The Cheloniidae family of turtles — more commonly known as “Sea tur… | Complete |
| Chelydridae | The Chelydridae family of turtles — more commonly known as “Snappin… | Complete |
| Dermatemydidae | The Dermatemydidae family of turtles — more commonly known as “Cent… | Complete |
| Dermochelyidae | The Dermochelyidae family of turtles — more commonly known as “Leat… | Complete |
# =============================================================================
# TASK 2 | SECTION B — Data Handling Report
# pivot_longer error fix: build the audit with map_dfr() per column
# =============================================================================
qa_hockey <- map_dfr(names(hockey_df), function(col) {
  x <- hockey_df[[col]]
  tibble(
    Column = col,
    `Data Type` = class(x)[1],
    `Missing (#)` = sum(is.na(x)),
    `Missing (%)` = round(mean(is.na(x)) * 100, 2),
    Status = if_else(sum(is.na(x)) == 0, "Clean", "Has NA")
  )
})
qa_hockey %>%
arrange(desc(`Missing (#)`)) %>%
kable(caption = "Audit Kualitas Data — Hockey Teams Dataset") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = TRUE, font_size = 14) %>%
column_spec(1, bold = TRUE, color = "#0D3B66") %>%
column_spec(5, bold = TRUE) %>%
row_spec(which(qa_hockey %>%
arrange(desc(`Missing (#)`)) %>% pull(`Missing (#)`) > 0),
background = "#FFF8F0")| Kolom | Tipe Data | Missing (#) | Missing (%) | Status |
|---|---|---|---|---|
| team_name | character | 0 | 0 | Bersih |
| year | integer | 0 | 0 | Bersih |
| wins | integer | 0 | 0 | Bersih |
| losses | integer | 0 | 0 | Bersih |
| win_pct | numeric | 0 | 0 | Bersih |
| data_status | character | 0 | 0 | Bersih |
| scraped_at | character | 0 | 0 | Bersih |
# =============================================================================
# TASK 2 | SECTION C — Systematic Data Cleaning
# =============================================================================
hockey_clean <- hockey_df %>%
mutate(across(where(is.numeric), ~ replace_na(.x, median(.x, na.rm=TRUE)))) %>%
mutate(across(where(is.character), ~ replace_na(.x, "Unknown"))) %>%
mutate(
year = as.integer(year),
wins = as.integer(wins),
losses = as.integer(losses),
win_pct = as.numeric(win_pct)
) %>%
distinct()
cat("Baris sebelum cleaning :", nrow(hockey_df), "\n")Baris sebelum cleaning : 557
Baris sesudah cleaning : 557
Missing values tersisa : 0
Before any analysis, three aspects must be verified: data types, missing values, and duplicates.

pivot_longer error fix: use map_dfr() — each column is processed independently and yields a homogeneously typed tibble, so there is no type conflict between numeric and character columns (illustrated below).
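A toy demonstration of why the pivot_longer() approach failed on this mixed-type table (error text abridged):

# pivot_longer() must combine all selected columns into one value column,
# which fails when character and integer columns are mixed
tidyr::pivot_longer(hockey_df, cols = everything())
# Error: Can't combine `team_name` <character> and `year` <integer>.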
across() is the most idiomatic way in modern R to apply the same function to many columns:

- where(is.numeric) — selects numeric columns declaratively
- replace_na(.x, median(.x)) — median imputation is more robust than the mean for skewed distributions
- where(is.character) — selects text columns
- replace_na(.x, "Unknown") — an explicit placeholder
- distinct() — automatic deduplication

The code is identical for 5 or 50 columns — no modification needed, as the toy example below shows.
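A minimal sketch on invented data:

library(dplyr); library(tidyr)
# The same two across() calls clean a table of any width
toy <- tibble(a = c(1, NA, 3), b = c("x", NA, "y"), d = c(NA, 10, 30))
toy %>%
  mutate(across(where(is.numeric),   ~ replace_na(.x, median(.x, na.rm = TRUE)))) %>%
  mutate(across(where(is.character), ~ replace_na(.x, "Unknown")))
# -> a: NA becomes 2 (median), d: NA becomes 20, b: NA becomes "Unknown"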
# =============================================================================
# TASK 2 | SECTION D — Conditional Logic
# =============================================================================
hockey_flagged <- hockey_clean %>%
mutate(
# case_when: tiered multi-condition logic
quality_flag = case_when(
is.na(win_pct) ~ "Missing Win%",
win_pct >= 0.65 ~ "Elite (65% ke atas)",
win_pct >= 0.55 ~ "Above Average",
win_pct >= 0.45 ~ "Average",
win_pct >= 0.35 ~ "Below Average",
TRUE ~ "Poor (di bawah 35%)"
),
# if_else: binary condition
tier = if_else(wins >= 40, "Top Tier", "Lower Tier"),
win_label = if_else(win_pct >= 0.5, "Winning Season", "Losing Season")
)
hockey_flagged %>%
count(quality_flag, name = "Jumlah") %>%
arrange(desc(Jumlah)) %>%
kable(caption = "Distribusi Quality Flag — Hockey Teams") %>%
kable_styling(bootstrap_options = c("striped","hover"), font_size = 14) %>%
column_spec(1, bold = TRUE) %>%
column_spec(2, bold = TRUE, color = "#0D3B66")| quality_flag | Jumlah |
|---|---|
| Average | 216 |
| Below Average | 160 |
| Above Average | 91 |
| Poor (di bawah 35%) | 84 |
| Elite (65% ke atas) | 6 |
| Criterion | Static | Pagination | AJAX / JSON | iFrame |
|---|---|---|---|---|
| Setup complexity | Low | Medium | Medium | High |
| Parsing speed | Fast | Medium | Very fast | Slow |
| Needs JavaScript | No | No | Yes | Yes* |
| Stability | High | High | Medium | Low |
| R libraries | rvest + httr | rvest + for loop | httr + jsonlite | httr + rvest |
| Detection risk | Low | Medium | Medium | High |
| Output format | HTML table | Multi-page HTML | Structured JSON | Varies |
| Example in this project | Countries of the World | Hockey Teams (24 pages) | Oscar Winners | Turtle Families |
scrapethissite.com/pages/simple/ is a single static HTML page holding all of the country data with no pagination, no JavaScript, and no authentication. A single GET request yields the full dataset. The DOM structure is very consistent and there is no anti-bot mechanism at all.
scrapethissite.com/pages/frames/ (the turtle data) is the hardest in this project. It needs two layers of HTTP requests — one to get the iframe URL from the main page, another to access the iframe content itself. On top of that, deep scraping into each turtle's detail page adds significantly to the request count.
Static — the HTML exists at the first request. One GET request is enough. Simplest and most reproducible. Example: Countries of the World.
Pagination — content split across many pages via a URL parameter. A for loop with an adaptive break condition. Still plain rvest, no JavaScript needed. Example: Hockey Teams, 24 pages.
AJAX / JSON — data loaded asynchronously after the page renders. The JSON endpoint is found via the DevTools Network tab. Use httr with the right AJAX headers plus jsonlite. The result is a clean dataframe straight away. Example: Oscar Winners.
iFrame — content from a separate HTML document embedded in a frame. Find the src URL of the iframe tag and issue a separate GET request to it. If the iframe content does not use additional JavaScript, no browser automation is needed. Example: Turtle Families.
Insight 1 — Pre-processing and post-processing are mandatory: of all the datasets in this project, not one was analysis-ready without cleaning. Steps such as duplicate removal, text standardisation, negative-value handling, and missing-value imputation are crucial for accurate data.
Insight 2 — Scraping complexity scales with content dynamics: the static site (Countries) was very easy, while the iFrame site (Turtles) needed a layered approach. The more elaborately a website delivers its content, the more elaborate the scraping logic required.
Insight 3 — HTTP headers decide AJAX scraping success: AJAX scraping errors are often not code bugs but incomplete HTTP headers. Modern servers check the X-Requested-With and Accept headers to verify that a request comes from a legitimate AJAX context before responding with JSON.
Recommendation 1 — Always apply robust error handling: in web scraping, many things can go wrong: dropped connections, changed HTML elements, blocked requests. Wrapping every HTTP request in tryCatch() and checking that an element exists before extracting it are mandatory practices so the script does not crash midway.
Recommendation 2 — Respect website policies and throttle requests: check the site's robots.txt before scraping. Add Sys.sleep() between requests to avoid overloading the server and risking an IP ban. Use an honest user_agent() for identification. Ethical, responsible scraping is essential for sustainable data collection. A small helper combining both recommendations is sketched below.
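A minimal sketch of a hypothetical polite_get() helper, under the assumptions above (the contact address is a placeholder; robots.txt is checked manually, or with the robotstxt package if installed):

library(httr)
# Wrap every request in tryCatch, identify honestly, pause between calls
polite_get <- function(url, pause = 1) {
  resp <- tryCatch(
    GET(url, timeout(30), user_agent("R Academic Bot; contact: student@example.edu")),
    error = function(e) { message("Request failed: ", e$message); NULL }
  )
  Sys.sleep(pause)  # throttle between requests
  if (!is.null(resp) && status_code(resp) == 200) resp else NULL
}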
| Technique | Dataset | R Libraries | Difficulty | Level |
|---|---|---|---|---|
| Static | Countries of the World | rvest | Easy | Beginner |
| Pagination | Hockey Teams (24 pages) | rvest + loop | Medium | Intermediate |
| AJAX JSON | Oscar Winners | httr + jsonlite | Medium | Intermediate |
| iFrame | Turtle Families | httr + rvest | Hard | Advanced |