NIM: 52250030
UTS PEMROGRAMAN SAINS DATA I Sins Data Kelas A
Tujuan pengerjaan mini project adalah:
Membaca 5 file berbeda (CSV, JSON, TXT, XLSX, XML), mengecek struktur kolom, dan menggabungkan data yang kompatibel menggunakan looping dan IF/IF-ELSE.
# ── MEMBACA 5 FILE DAN MENGGABUNGKAN YANG STRUKTURNYA SAMA ──
# Daftar file
files_info <- list(
csv = list(file = "ecommerce.csv", type = "csv"),
xlsx = list(file = "ecommerce.xlsx", type = "xlsx"),
json = list(file = "ecommerce.json", type = "json"),
txt = list(file = "ecommerce.txt", type = "txt"),
xml = list(file = "ecommerce.xml", type = "xml")
)
# Fungsi baca file
baca_file <- function(file, type) {
if (!file.exists(file)) stop(paste("File tidak ditemukan:", file))
if (type == "csv") return(read.csv(file, stringsAsFactors = FALSE))
if (type == "xlsx") return(read_excel(file))
if (type == "json") return(fromJSON(file))
if (type == "txt") return(read.delim(file, sep = "\t", stringsAsFactors = FALSE))
if (type == "xml") {
doc <- read_xml(file)
# Sesuaikan tag: misal <Record> atau <row>
records <- xml_find_all(doc, ".//Record")
if (length(records) == 0) records <- xml_find_all(doc, ".//row")
df_list <- lapply(records, function(x) {
children <- xml_children(x)
values <- xml_text(children)
names <- xml_name(children)
as.list(setNames(values, names))
})
return(bind_rows(df_list))
}
stop("Format tidak dikenal")
}
# Baca semua file, simpan daftar kolom
raw_data <- list()
kolom_per_file <- list()
for (fmt in names(files_info)) {
cat(sprintf("\n── Membaca file: %s\n", files_info[[fmt]]$file))
df <- baca_file(files_info[[fmt]]$file, files_info[[fmt]]$type)
cat(sprintf(" Baris : %d | Kolom : %d\n", nrow(df), ncol(df)))
cat(" Kolom :", paste(colnames(df), collapse = ", "), "\n")
raw_data[[fmt]] <- df
kolom_per_file[[fmt]] <- sort(colnames(df))
}
# Kelompokkan file berdasarkan kesamaan struktur kolom
struktur_unik <- unique(kolom_per_file)
cat("\n=== Hasil Pengecekan Struktur ===\n")
for (i in seq_along(struktur_unik)) {
file_dengan_struktur <- names(which(sapply(kolom_per_file, function(x) identical(x, struktur_unik[[i]]))))
cat(sprintf("Struktur %d (%d kolom): %s\n", i, length(struktur_unik[[i]]),
paste(struktur_unik[[i]], collapse=", ")))
cat(sprintf(" File: %s\n", paste(file_dengan_struktur, collapse=", ")))
if (length(file_dengan_struktur) > 1) {
cat(" Status: Ready to merge \n")
} else {
cat(" Status: Need adjustment (hanya satu file, tidak bisa digabung)\n")
}
}
# Gabungkan file-file yang memiliki struktur sama
merged_dfs <- list()
for (i in seq_along(struktur_unik)) {
struct <- struktur_unik[[i]]
files_in_group <- names(which(sapply(kolom_per_file, function(x) identical(x, struct))))
if (length(files_in_group) >= 2) {
cat(sprintf("\nMenggabungkan %d file dengan struktur yang sama: %s\n",
length(files_in_group), paste(files_in_group, collapse=", ")))
# Ambil data frames, konversi semua kolom ke character untuk hindari konflik tipe
df_list <- lapply(files_in_group, function(f) {
df <- raw_data[[f]]
df <- as.data.frame(lapply(df, as.character), stringsAsFactors = FALSE)
return(df)
})
merged <- bind_rows(df_list)
# Simpan dengan nama berdasarkan struktur kolom
key <- paste(sort(struct), collapse="_")
merged_dfs[[key]] <- merged
cat(sprintf("Hasil gabungan: %d baris, %d kolom\n", nrow(merged), ncol(merged)))
}
}
# Pilih dataset utama (misal yang pertama kali berhasil digabung)
if (length(merged_dfs) > 0) {
main_df <- merged_dfs[[1]]
cat(sprintf("\n Dataset utama (gabungan) memiliki %d baris dan %d kolom\n", nrow(main_df), ncol(main_df)))
cat("Nama kolom dalam dataset utama:", paste(colnames(main_df), collapse=", "), "\n")
# Konversi tipe data jika kolom numerik ada
if ("unit_price" %in% colnames(main_df)) {
main_df$unit_price <- as.numeric(as.character(main_df$unit_price))
}
if ("net_sales" %in% colnames(main_df)) {
main_df$net_sales <- as.numeric(as.character(main_df$net_sales))
}
if ("customer_rating" %in% colnames(main_df)) {
main_df$customer_rating <- as.numeric(as.character(main_df$customer_rating))
}
} else {
cat("\n Tidak ada kelompok file dengan struktur yang sama untuk digabung.\n")
cat("Silakan periksa kembali file dataset. Pastikan minimal 2 file memiliki kolom identik.\n")
}
##
## ── Membaca file: ecommerce.csv
## Baris : 2000 | Kolom : 22
## Kolom : order_id, order_date, ship_date, platform, category, product_name, unit_price, quantity, gross_sales, campaign, voucher_code, discount_pct, discount_value, shipping_cost, net_sales, payment_method, customer_segment, region, stock_status, order_status, customer_rating, priority_flag
##
## ── Membaca file: ecommerce.xlsx
## Baris : 2000 | Kolom : 22
## Kolom : order_id, order_date, ship_date, platform, category, product_name, unit_price, quantity, gross_sales, campaign, voucher_code, discount_pct, discount_value, shipping_cost, net_sales, payment_method, customer_segment, region, stock_status, order_status, customer_rating, priority_flag
##
## ── Membaca file: ecommerce.json
## Baris : 2000 | Kolom : 22
## Kolom : order_id, order_date, ship_date, platform, category, product_name, unit_price, quantity, gross_sales, campaign, voucher_code, discount_pct, discount_value, shipping_cost, net_sales, payment_method, customer_segment, region, stock_status, order_status, customer_rating, priority_flag
##
## ── Membaca file: ecommerce.txt
## Baris : 2000 | Kolom : 1
## Kolom : order_id.order_date.ship_date.platform.category.product_name.unit_price.quantity.gross_sales.campaign.voucher_code.discount_pct.discount_value.shipping_cost.net_sales.payment_method.customer_segment.region.stock_status.order_status.customer_rating.priority_flag
##
## ── Membaca file: ecommerce.xml
## Baris : 2000 | Kolom : 22
## Kolom : order_id, order_date, ship_date, platform, category, product_name, unit_price, quantity, gross_sales, campaign, voucher_code, discount_pct, discount_value, shipping_cost, net_sales, payment_method, customer_segment, region, stock_status, order_status, customer_rating, priority_flag
##
## === Hasil Pengecekan Struktur ===
## Struktur 1 (22 kolom): campaign, category, customer_rating, customer_segment, discount_pct, discount_value, gross_sales, net_sales, order_date, order_id, order_status, payment_method, platform, priority_flag, product_name, quantity, region, ship_date, shipping_cost, stock_status, unit_price, voucher_code
## File: csv, xlsx, json, xml
## Status: Ready to merge
## Struktur 2 (1 kolom): order_id.order_date.ship_date.platform.category.product_name.unit_price.quantity.gross_sales.campaign.voucher_code.discount_pct.discount_value.shipping_cost.net_sales.payment_method.customer_segment.region.stock_status.order_status.customer_rating.priority_flag
## File: txt
## Status: Need adjustment (hanya satu file, tidak bisa digabung)
##
## Menggabungkan 4 file dengan struktur yang sama: csv, xlsx, json, xml
## Hasil gabungan: 8000 baris, 22 kolom
##
## Dataset utama (gabungan) memiliki 8000 baris dan 22 kolom
## Nama kolom dalam dataset utama: order_id, order_date, ship_date, platform, category, product_name, unit_price, quantity, gross_sales, campaign, voucher_code, discount_pct, discount_value, shipping_cost, net_sales, payment_method, customer_segment, region, stock_status, order_status, customer_rating, priority_flag
cat("── Dimensi Dataset ─────────────────────────────────\n")
cat(sprintf(" Baris : %d\n Kolom : %d\n\n", nrow(main_df), ncol(main_df)))
cat("── Tipe Data per Kolom ─────────────────────────────\n")
for (col in colnames(main_df)) {
cat(sprintf(" %-20s : %s\n", col, class(main_df[[col]])))
}
cat("\n── Missing Values per Kolom ────────────────────────\n")
mv <- colSums(is.na(main_df) | main_df == "" | main_df == "NA", na.rm=TRUE)
for (col in names(mv)) {
if (mv[col] > 0) cat(sprintf(" %-20s : %d missing\n", col, mv[col]))
}
cat("\n── Duplicate Rows ──────────────────────────────────\n")
cat(sprintf(" Jumlah duplikat : %d baris\n", sum(duplicated(main_df))))
## ── Dimensi Dataset ─────────────────────────────────
## Baris : 8000
## Kolom : 22
##
## ── Tipe Data per Kolom ─────────────────────────────
## order_id : character
## order_date : character
## ship_date : character
## platform : character
## category : character
## product_name : character
## unit_price : numeric
## quantity : character
## gross_sales : character
## campaign : character
## voucher_code : character
## discount_pct : character
## discount_value : character
## shipping_cost : character
## net_sales : numeric
## payment_method : character
## customer_segment : character
## region : character
## stock_status : character
## order_status : character
## customer_rating : numeric
## priority_flag : character
##
## ── Missing Values per Kolom ────────────────────────
## ship_date : 800 missing
## unit_price : 484 missing
## voucher_code : 196 missing
## discount_pct : 276 missing
## net_sales : 472 missing
## payment_method : 140 missing
## customer_rating : 1624 missing
## priority_flag : 752 missing
##
## ── Duplicate Rows ──────────────────────────────────
## Jumlah duplikat : 5172 baris
unit_price,
net_sales, customer_rating) sudah bertipe
numeric; sisanya character (perlu konversi lebih lanjut
untuk analisis).
customer_rating (1624 missing/20,3%) dan
ship_date (800 missing/10%).
payment_method dan
customer_rating.
net_sales (tidak logis untuk
transaksi penjualan).
df_clean <- main_df
# WAJIB LOOPING: bersihkan 3 kolom sekaligus
cols_to_trim <- c("product_name", "category", "region")
for (col in cols_to_trim) {
df_clean[[col]] <- trimws(as.character(df_clean[[col]]))
}
# Loop baris: cleaning IF/IF-ELSE
for (i in seq_len(nrow(df_clean))) {
# 1. Standardisasi Platform (LENGKAP)
plat_clean <- trimws(tolower(as.character(df_clean$platform[i])))
if (plat_clean %in% c("shopee")) {
df_clean$platform[i] <- "Shopee"
} else if (plat_clean %in% c("tokopedia","tokped")) {
df_clean$platform[i] <- "Tokopedia"
} else if (plat_clean == "tiktok shop") {
df_clean$platform[i] <- "TikTok Shop"
} else if (plat_clean %in% c("blibli")) {
df_clean$platform[i] <- "Blibli"
} else if (plat_clean %in% c("lazada")) {
df_clean$platform[i] <- "Lazada"
}
# Tambahkan else jika perlu (biarkan nilai asli jika tidak dikenali)
# 2. Cleaning nilai harga / net_sales (SAMA seperti sebelumnya)
clean_num <- function(val) {
v <- as.character(val)
if (grepl("Rp", v, ignore.case=TRUE)) {
v <- gsub("Rp","",v); v <- gsub("\\.","",v); v <- gsub(",",".",v)
}
n <- suppressWarnings(as.numeric(trimws(v)))
if (is.na(n) || n < 0) return(0)
return(n)
}
df_clean$unit_price[i] <- clean_num(df_clean$unit_price[i])
df_clean$net_sales[i] <- clean_num(df_clean$net_sales[i])
# 3. Missing Value — payment_method (WAJIB IF)
pm <- trimws(as.character(df_clean$payment_method[i]))
if (is.na(pm) || pm == "" || pm == "NA" || pm == "nan") {
df_clean$payment_method[i] <- "Unknown"
}
# 3. Missing Value — customer_rating (WAJIB IF)
cr <- suppressWarnings(as.numeric(df_clean$customer_rating[i]))
if (is.na(cr)) {
df_clean$customer_rating[i] <- 4.0
}
# 4. Standardisasi order_status (LENGKAP)
st_clean <- trimws(tolower(as.character(df_clean$order_status[i])))
if (st_clean %in% c("delivered","completed")) {
df_clean$order_status[i] <- "Completed"
} else if (st_clean %in% c("cancelled","cancel","batal")) {
df_clean$order_status[i] <- "Cancelled"
} else if (st_clean %in% c("on delivery","shipped","dikirim")) {
df_clean$order_status[i] <- "Shipped"
} else if (st_clean %in% c("retur","returned","return")) {
df_clean$order_status[i] <- "Returned"
}
}
if(exists("df_clean")){
datatable(head(df_clean, 100),
options = list(scrollX = TRUE, pageLength = 5),
caption = "Data yang sudah dibersihkan (Top 100)")
}
payment_method → “Unknown”, customer_rating →
4.0 (median).
“delivered”/“completed” → “Completed”
“cancelled”/“cancel”/“batal” → “Cancelled”
“on delivery”/“shipped”/“dikirim” → “Shipped”
“retur”/“returned”/“return” → “Returned”
Nilai status lain (misal "Pending", "Processing") dibiarkan apa adanya.
product_name, category, region)
sekaligus.
df_clean$is_high_value <- NA_character_
df_clean$order_priority <- NA_character_
df_clean$valid_transaction <- NA_character_
for (i in seq_len(nrow(df_clean))) {
sales <- as.numeric(df_clean$net_sales[i])
# 1. is_high_value
if (sales > 1000000) {
df_clean$is_high_value[i] <- "Yes"
} else {
df_clean$is_high_value[i] <- "No"
}
# 2. order_priority — WAJIB NESTED IF
if (sales > 1000000) {
df_clean$order_priority[i] <- "High"
} else {
if (sales >= 500000) {
df_clean$order_priority[i] <- "Medium"
} else {
df_clean$order_priority[i] <- "Low"
}
}
# 3. valid_transaction
if (df_clean$order_status[i] == "Cancelled") {
df_clean$valid_transaction[i] <- "Invalid"
} else {
df_clean$valid_transaction[i] <- "Valid"
}
}
cat("── Distribusi Kolom Baru ───────────────────────────\n")
cat("\nis_high_value:\n"); print(table(df_clean$is_high_value))
cat("\norder_priority:\n"); print(table(df_clean$order_priority))
cat("\nvalid_transaction:\n"); print(table(df_clean$valid_transaction))
cat("\n── Preview 5 baris ─────────────────────────────────\n")
print(head(df_clean[, c("platform","net_sales","is_high_value","order_priority","valid_transaction")], 5))
## ── Distribusi Kolom Baru ───────────────────────────
##
## is_high_value:
##
## No Yes
## 5096 2904
##
## order_priority:
##
## High Low Medium
## 2904 3508 1588
##
## valid_transaction:
##
## Invalid Valid
## 608 7392
##
## ── Preview 5 baris ─────────────────────────────────
## platform net_sales is_high_value order_priority valid_transaction
## 1 Tokopedia 680058 No Medium Valid
## 2 TikTok Shop 1476873 Yes High Valid
## 3 Tokopedia 369715 No Low Valid
## 4 Tokopedia 1382570 Yes High Valid
## 5 Tokopedia 376696 No Low Valid
# --- 1. Donut Chart: High Value ---
# Siapkan data (WAJIB agar tidak error object not found)
df_high <- as.data.frame(table(df_clean$is_high_value))
colnames(df_high) <- c("Status", "Count")
plot_ly(df_high, labels = ~Status, values = ~Count, type = 'pie', hole = 0.6,
textinfo = 'label+percent',
marker = list(colors = c('#FDB862', '#8B1D3D')),
height = 300) %>%
layout(title = list(text = "<b>High Value</b>", font = list(size = 14)),
showlegend = TRUE, margin = list(t = 50, b = 20, l = 20, r = 20))
# --- 2. Donut Chart: Order Priority ---
# Siapkan data
df_prior <- as.data.frame(table(df_clean$order_priority))
colnames(df_prior) <- c("Priority", "Count")
plot_ly(df_prior, labels = ~Priority, values = ~Count, type = 'pie', hole = 0.6,
textinfo = 'label+percent',
marker = list(colors = c('#FDB862', '#FF8C00', '#8B1D3D', '#4A148C')),
height = 300) %>%
layout(title = list(text = "<b>Order Priority</b>", font = list(size = 14)),
showlegend = TRUE, margin = list(t = 50, b = 20, l = 20, r = 20))
# --- 3. Donut Chart: Valid Transaction ---
# Siapkan data
df_valid <- as.data.frame(table(df_clean$valid_transaction))
colnames(df_valid) <- c("Validity", "Count")
plot_ly(df_valid, labels = ~Validity, values = ~Count, type = 'pie', hole = 0.6,
textinfo = 'label+percent',
marker = list(colors = c('#2ECC71', '#E74C3C')),
height = 300) %>%
layout(title = list(text = "<b>Valid Transaction</b>", font = list(size = 14)),
showlegend = TRUE, margin = list(t = 50, b = 20, l = 20, r = 20))
net_sales >
1.000.000 diberi label “Yes” (2.904 transaksi, 36,3%), sisanya
5.096 transaksi (63,7%) “No”.
order_status = “Cancelled” (608 transaksi,
7,6%). Sisanya 7.392 transaksi (92,4%) “Valid”.
is_high_value (IF
sederhana). Membuat kolom order_priority dengan nested
IF (IF di dalam ELSE). Membuat kolom valid_transaction
berdasarkan status transaksi.
cat("── 1. Platform Paling Dominan ──────────────────────\n")
plat_count <- sort(table(df_clean$platform), decreasing=TRUE)
print(plat_count)
cat(sprintf(" → Platform dominan: %s (%d transaksi)\n",
names(plat_count)[1], plat_count[1]))
cat("\n── 2. Category Paling Sering Muncul ────────────────\n")
cat_count <- sort(table(df_clean$category), decreasing=TRUE)
print(cat_count)
cat(sprintf(" → Kategori terbanyak: %s (%d transaksi)\n",
names(cat_count)[1], cat_count[1]))
cat("\n── 3. Status Transaksi Terbanyak ────────────────────\n")
status_count <- sort(table(df_clean$order_status), decreasing=TRUE)
print(status_count)
cat(sprintf(" → Status terbanyak: %s (%d transaksi)\n",
names(status_count)[1], status_count[1]))
## ── 1. Platform Paling Dominan ──────────────────────
##
## Shopee TikTok Shop Blibli Lazada Tokopedia
## 1664 1624 1616 1552 1544
## → Platform dominan: Shopee (1664 transaksi)
##
## ── 2. Category Paling Sering Muncul ────────────────
##
## Sports Fashion Beauty Home Living Electronics beauty
## 1500 1464 1368 1368 1312 152
## FASHION home living Home_Living sports electronics fashion
## 124 120 112 112 96 92
## ELECTRONICS SPORTS HOME LIVING BEAUTY home_living HOME_LIVING
## 48 44 40 32 8 8
## → Kategori terbanyak: Sports (1500 transaksi)
##
## ── 3. Status Transaksi Terbanyak ────────────────────
##
## Completed Cancelled Shipped Returned
## 6320 608 600 472
## → Status terbanyak: Completed (6320 transaksi)
# 1. Buat datanya dulu (WAJIB ADA agar tidak error 'object not found')
plat_df <- as.data.frame(table(df_clean$platform))
colnames(plat_df) <- c("Platform", "Jumlah")
plat_df <- plat_df[order(-plat_df$Jumlah), ]
# 2. Baru buat visualisasinya
plot_ly(plat_df, x = ~reorder(Platform, -Jumlah), y = ~Jumlah, type = 'bar',
marker = list(color = '#FDB862'),
text = ~paste("Jumlah:", Jumlah), hoverinfo = 'text+x',
height = 300) %>%
layout(title = list(text = "<b>Platform Paling Dominan</b>", font = list(size = 14)),
xaxis = list(title = ""),
yaxis = list(title = "Jumlah"),
margin = list(t = 50, b = 50))
# 1. Buat datanya dulu
cat_df <- as.data.frame(table(df_clean$category))
colnames(cat_df) <- c("Category", "Jumlah")
cat_df <- cat_df[order(-cat_df$Jumlah), ]
# 2. Baru buat visualisasinya
plot_ly(cat_df, x = ~reorder(Category, -Jumlah), y = ~Jumlah, type = 'bar',
marker = list(color = '#8B1D3D'),
text = ~paste("Jumlah:", Jumlah), hoverinfo = 'text+x',
height = 300) %>%
layout(title = list(text = "<b>Kategori Paling Sering Muncul</b>", font = list(size = 14)),
xaxis = list(title = ""),
yaxis = list(title = "Jumlah"),
margin = list(t = 50, b = 50))
# 1. Buat datanya dulu
stat_df <- as.data.frame(table(df_clean$order_status))
colnames(stat_df) <- c("Status", "Jumlah")
stat_df <- stat_df[order(-stat_df$Jumlah), ]
# 2. Baru buat visualisasinya
plot_ly(stat_df, x = ~reorder(Status, -Jumlah), y = ~Jumlah, type = 'bar',
marker = list(color = '#4A148C'),
text = ~paste("Jumlah:", Jumlah), hoverinfo = 'text+x',
height = 300) %>%
layout(title = list(text = "<b>Status Transaksi Terbanyak</b>", font = list(size = 14)),
xaxis = list(title = ""),
yaxis = list(title = "Jumlah"),
margin = list(t = 50, b = 50))
Menggunakan rvest untuk scraping halaman statis. Data
diambil: country name, capital, population.
url_countries <- "https://www.scrapethissite.com/pages/simple/"
page_countries <- read_html(url_countries)
country_name <- page_countries %>%
html_nodes(".country-name") %>%
html_text(trim = TRUE)
capital <- page_countries %>%
html_nodes(".country-capital") %>%
html_text(trim = TRUE)
population <- page_countries %>%
html_nodes(".country-population") %>%
html_text(trim = TRUE)
# Pastikan panjang sama
min_len <- min(length(country_name), length(capital), length(population))
countries_df <- data.frame(
country_name = country_name[1:min_len],
capital = capital[1:min_len],
population = population[1:min_len],
stringsAsFactors = FALSE
)
# data_status dengan IF
countries_df$data_status <- ifelse(
countries_df$country_name == "" | is.na(countries_df$country_name),
"Incomplete", "Complete"
)
cat(sprintf("Jumlah data diambil: %d negara\n", nrow(countries_df)))
print(head(countries_df, 8))
write.csv(countries_df, "countries.csv", row.names = FALSE)
## Jumlah data diambil: 250 negara
## country_name capital population data_status
## 1 Andorra Andorra la Vella 84000 Complete
## 2 United Arab Emirates Abu Dhabi 4975593 Complete
## 3 Afghanistan Kabul 29121286 Complete
## 4 Antigua and Barbuda St. John's 86754 Complete
## 5 Anguilla The Valley 13254 Complete
## 6 Albania Tirana 2986952 Complete
## 7 Armenia Yerevan 2968000 Complete
## 8 Angola Luanda 13068161 Complete
Menggunakan looping pagination + form query parameter. Wajib handle multi-page dan search form.
hockey_df <- data.frame()
search_q <- "B" # handle form/query
for (p in 1:3) { # looping pagination
url_h <- paste0(
"https://www.scrapethissite.com/pages/forms/?page_num=", p,
"&q=", search_q
)
pg <- read_html(url_h)
names_h <- pg %>% html_nodes(".name") %>% html_text(trim=TRUE)
years_h <- pg %>% html_nodes(".year") %>% html_text(trim=TRUE)
pts_h <- pg %>% html_nodes(".pct") %>% html_text(trim=TRUE)
wins_h <- pg %>% html_nodes(".wins") %>% html_text(trim=TRUE)
if (length(names_h) > 0) {
min_l <- min(length(names_h), length(years_h), length(pts_h), length(wins_h))
pg_df <- data.frame(
team_name = names_h[1:min_l],
year = years_h[1:min_l],
wins = wins_h[1:min_l],
points = pts_h[1:min_l],
page = p,
stringsAsFactors = FALSE
)
hockey_df <- rbind(hockey_df, pg_df)
}
Sys.sleep(0.5)
}
# data_status dengan IF
hockey_df$data_status <- ifelse(
hockey_df$team_name == "" | is.na(hockey_df$team_name),
"Incomplete", "Complete"
)
cat(sprintf("Jumlah data diambil: %d tim hockey\n", nrow(hockey_df)))
print(head(hockey_df, 8))
write.csv(hockey_df, "hockey_teams.csv", row.names = FALSE)
## Jumlah data diambil: 75 tim hockey
## team_name year wins points page data_status
## 1 Boston Bruins 1990 44 0.55 1 Complete
## 2 Buffalo Sabres 1990 31 0.388 1 Complete
## 3 Chicago Blackhawks 1990 49 0.613 1 Complete
## 4 Pittsburgh Penguins 1990 41 0.512 1 Complete
## 5 Quebec Nordiques 1990 16 0.2 1 Complete
## 6 St. Louis Blues 1990 47 0.588 1 Complete
## 7 Boston Bruins 1991 36 0.45 1 Complete
## 8 Buffalo Sabres 1991 31 0.388 1 Complete
Konten Oscar diload via AJAX. Pendekatan: akses endpoint JSON langsung dari network inspection.
# ── Oscar Winning Films (AJAX) dengan endpoint JSON ──────────
oscar_df <- data.frame()
years <- 2010:2015 # bisa diperlebar 2000-2015
for (year in years) {
url <- sprintf("https://www.scrapethissite.com/pages/ajax-javascript/?ajax=true&year=%d", year)
resp <- GET(url, user_agent("Mozilla/5.0"))
if (status_code(resp) == 200) {
data <- fromJSON(content(resp, as = "text", encoding = "UTF-8"))
if (is.data.frame(data) && nrow(data) > 0) {
data$year <- year
oscar_df <- rbind(oscar_df, data)
}
} else {
cat(sprintf("Gagal untuk tahun %d (HTTP %d)\n", year, status_code(resp)))
}
Sys.sleep(0.5)
}
if (nrow(oscar_df) == 0) {
warning("Tidak ada data Oscar yang diambil. Periksa koneksi.")
} else {
# Standardisasi nama kolom
names(oscar_df) <- tolower(names(oscar_df))
if(!"category" %in% names(oscar_df)) oscar_df$category <- "Best Picture"
# data_status dengan IF
oscar_df$data_status <- "Complete"
for (i in 1:nrow(oscar_df)) {
if (is.na(oscar_df$title[i]) || oscar_df$title[i] == "") oscar_df$data_status[i] <- "Incomplete"
else if (is.na(oscar_df$year[i])) oscar_df$data_status[i] <- "Incomplete"
}
cat(sprintf("Jumlah data diambil: %d film Oscar\n", nrow(oscar_df)))
print(head(oscar_df, 6))
write.csv(oscar_df, "oscar_films.csv", row.names = FALSE)
}
## Jumlah data diambil: 87 film Oscar
## title year awards nominations best_picture category
## 1 The King's Speech 2010 4 12 TRUE Best Picture
## 2 Inception 2010 4 8 NA Best Picture
## 3 The Social Network 2010 3 8 NA Best Picture
## 4 The Fighter 2010 2 7 NA Best Picture
## 5 Toy Story 3 2010 2 5 NA Best Picture
## 6 Alice in Wonderland 2010 2 3 NA Best Picture
## data_status
## 1 Complete
## 2 Complete
## 3 Complete
## 4 Complete
## 5 Complete
## 6 Complete
Data tersimpan dalam iframe. Pendekatan: akses langsung URL sumber
iframe dengan parameter frame=i.
# Akses langsung iframe source URL
url_turtle <- "https://www.scrapethissite.com/pages/frames/?frame=i"
pg_turtle <- read_html(url_turtle)
family <- pg_turtle %>% html_nodes("h3.family-name") %>% html_text(trim=TRUE)
desc_t <- pg_turtle %>% html_nodes(".description") %>% html_text(trim=TRUE)
more_t <- pg_turtle %>% html_nodes(".lead") %>% html_text(trim=TRUE)
# Pastikan panjang konsisten
n_fam <- length(family)
if (length(desc_t) == 0) desc_t <- rep("No description", n_fam)
if (length(more_t) == 0) more_t <- rep("No additional info", n_fam)
# Samakan jika ada perbedaan jumlah baris
if (length(desc_t) < n_fam) desc_t <- c(desc_t, rep("No description", n_fam - length(desc_t)))
if (length(more_t) < n_fam) more_t <- c(more_t, rep("No additional info", n_fam - length(more_t)))
turtle_df <- data.frame(
family_name = family,
description = desc_t,
additional_info = more_t,
stringsAsFactors = FALSE
)
turtle_df$data_status <- ifelse(
turtle_df$family_name == "" | is.na(turtle_df$family_name),
"Incomplete", "Complete"
)
cat(sprintf("Jumlah data diambil: %d turtle family\n", nrow(turtle_df)))
print(head(turtle_df, 5))
write.csv(turtle_df, "turtles.csv", row.names = FALSE)
## Jumlah data diambil: 14 turtle family
## family_name description additional_info data_status
## 1 Carettochelyidae No description No additional info Complete
## 2 Cheloniidae No description No additional info Complete
## 3 Chelydridae No description No additional info Complete
## 4 Dermatemydidae No description No additional info Complete
## 5 Dermochelyidae No description No additional info Complete
rvest dengan selector CSS sederhana. Tidak ada
missing value.
page_num dan query q=“B”. Berhasil
mengambil 75 tim (nama, tahun, menang, poin). Status semua
Complete. Menunjukkan kemampuan menangani pagination dan form.
?ajax=true&year=…) hasil inspeksi network. Berhasil
mengambil 87 film (2010-2015) dengan data judul, tahun, nominasi,
penghargaan. Status Complete. Membuktikan penanganan konten
dinamis tanpa Selenium.
?frame=i). Berhasil mengambil 14 keluarga penyu
(nama, deskripsi, info tambahan). Meskipun deskripsi banyak “No
description”, status tetap Complete karena nama keluarga ada.
Teknik mengatasi isolasi iframe.
data_status dengan IF.
# ── MP2 Section B: Data Handling untuk setiap hasil scraping ──
datasets <- list(
Countries = countries_df,
Hockey = hockey_df,
Oscar = oscar_df,
Turtles = turtle_df
)
for (name in names(datasets)) {
d <- datasets[[name]]
cat(sprintf("\n══ %s ═══════════════════════════════════════\n", name))
cat(sprintf(" Baris : %d | Kolom : %d\n", nrow(d), ncol(d)))
cat(" Kolom :", paste(colnames(d), collapse=", "), "\n")
cat(" Tipe data:\n")
for (col in colnames(d)) {
cat(sprintf(" %-20s : %s\n", col, class(d[[col]])))
}
mv_count <- sum(is.na(d) | d == "" | d == "NA", na.rm=TRUE)
dup_count <- sum(duplicated(d))
cat(sprintf(" Missing : %d | Duplikat: %d\n", mv_count, dup_count))
}
##
## ══ Countries ═══════════════════════════════════════
## Baris : 250 | Kolom : 4
## Kolom : country_name, capital, population, data_status
## Tipe data:
## country_name : character
## capital : character
## population : character
## data_status : character
## Missing : 0 | Duplikat: 0
##
## ══ Hockey ═══════════════════════════════════════
## Baris : 75 | Kolom : 6
## Kolom : team_name, year, wins, points, page, data_status
## Tipe data:
## team_name : character
## year : character
## wins : character
## points : character
## page : integer
## data_status : character
## Missing : 0 | Duplikat: 0
##
## ══ Oscar ═══════════════════════════════════════
## Baris : 87 | Kolom : 7
## Kolom : title, year, awards, nominations, best_picture, category, data_status
## Tipe data:
## title : character
## year : integer
## awards : integer
## nominations : integer
## best_picture : logical
## category : character
## data_status : character
## Missing : 81 | Duplikat: 0
##
## ══ Turtles ═══════════════════════════════════════
## Baris : 14 | Kolom : 4
## Kolom : family_name, description, additional_info, data_status
## Tipe data:
## family_name : character
## description : character
## additional_info : character
## data_status : character
## Missing : 0 | Duplikat: 0
# ── 1. Clean Countries ────────────────────────────────────────
for (i in seq_len(nrow(countries_df))) {
# Trim & proper case
countries_df$country_name[i] <- trimws(countries_df$country_name[i])
countries_df$capital[i] <- trimws(countries_df$capital[i])
# IF: handle missing capital
if (is.na(countries_df$capital[i]) || countries_df$capital[i] == "") {
countries_df$capital[i] <- "Unknown"
}
# Konversi population ke numerik
pop <- suppressWarnings(as.numeric(gsub(",","",countries_df$population[i])))
if (is.na(pop)) {
countries_df$population[i] <- "0"
}
}
countries_df <- unique(countries_df)
# ── 2. Clean Hockey ───────────────────────────────────────────
for (i in seq_len(nrow(hockey_df))) {
hockey_df$team_name[i] <- tools::toTitleCase(
tolower(trimws(hockey_df$team_name[i]))
)
yr <- suppressWarnings(as.integer(hockey_df$year[i]))
if (is.na(yr)) {
hockey_df$year[i] <- NA
} else {
hockey_df$year[i] <- as.character(yr)
}
}
hockey_df <- unique(hockey_df)
# ── 3. Clean Turtles ─────────────────────────────────────────
for (i in seq_len(nrow(turtle_df))) {
turtle_df$family_name[i] <- tools::toTitleCase(
tolower(trimws(turtle_df$family_name[i]))
)
desc <- as.character(turtle_df$description[i])
if (!is.na(desc)) {
turtle_df$description[i] <- trimws(gsub("\\s+", " ", desc))
}
if (is.na(turtle_df$additional_info[i]) ||
turtle_df$additional_info[i] == "") {
turtle_df$additional_info[i] <- "No additional info"
}
}
turtle_df <- unique(turtle_df)
# ── 4. Clean Oscar ────────────────────────────────────────────
for (col in c("title","category")) {
oscar_df[[col]] <- trimws(as.character(oscar_df[[col]]))
}
oscar_df <- unique(oscar_df)
cat("Cleaning selesai.\n")
cat(sprintf("Countries : %d baris\n", nrow(countries_df)))
cat(sprintf("Hockey : %d baris\n", nrow(hockey_df)))
cat(sprintf("Oscar : %d baris\n", nrow(oscar_df)))
cat(sprintf("Turtles : %d baris\n", nrow(turtle_df)))
# Simpan ulang CSV yang sudah bersih
write.csv(countries_df, "countries.csv", row.names=FALSE)
write.csv(hockey_df, "hockey_teams.csv", row.names=FALSE)
write.csv(oscar_df, "oscar_films.csv", row.names=FALSE)
write.csv(turtle_df, "turtles.csv", row.names=FALSE)
cat("\nSemua CSV berhasil disimpan.\n")
## Cleaning selesai.
## Countries : 250 baris
## Hockey : 75 baris
## Oscar : 87 baris
## Turtles : 14 baris
##
## Semua CSV berhasil disimpan.
Countries: 250 baris. Dilakukan trimming spasi, penanganan missing capital (diisi “Unknown”), konversi population ke numerik (gagal jadi 0). Tidak ada perubahan jumlah baris.
Hockey: 75 baris. Dilakukan trimming spasi, konversi nama tim ke proper case, konversi year ke integer (NA jika gagal). Tidak ada perubahan jumlah baris.
Oscar: 87 baris. Dilakukan trimming spasi pada kolom title dan category, serta penghapusan duplikat (tidak ada). Jumlah baris tetap.
Turtles: 14 baris. Dilakukan trimming spasi, konversi keluarga ke proper case, penghapusan whitespace berlebih pada deskripsi, pengisian additional info kosong dengan “No additional info”. Jumlah baris tetap.
Kepatuhan soal: Menggunakan looping & IF untuk membersihkan setiap dataset (trim, konversi tipe, handle missing, hapus duplikat). Semua CSV berhasil disimpan ulang.
# Fungsi assign_status dengan 3 kondisi
assign_status <- function(df, key_col) {
df$data_status <- NA_character_
for (i in seq_len(nrow(df))) {
val <- as.character(df[[key_col]][i])
if (is.na(val) || val == "" || val == "Unknown" || val == "No description") {
df$data_status[i] <- "Incomplete"
} else if (nchar(val) < 3) {
df$data_status[i] <- "Incomplete"
} else {
df$data_status[i] <- "Complete"
}
}
return(df)
}
# Terapkan ke semua dataset
countries_df <- assign_status(countries_df, "country_name")
hockey_df <- assign_status(hockey_df, "team_name")
oscar_df <- assign_status(oscar_df, "title")
turtle_df <- assign_status(turtle_df, "family_name")
cat("--- Ringkasan Status Data (Turtles) ---\n")
print(table(turtle_df$data_status))
cat("\n--- Preview 5 baris Turtles ---\n")
knitr::kable(head(turtle_df, 5))
## --- Ringkasan Status Data (Turtles) ---
##
## Complete
## 14
##
## --- Preview 5 baris Turtles ---
| family_name | description | additional_info | data_status |
|---|---|---|---|
| Carettochelyidae | No description | No additional info | Complete |
| Cheloniidae | No description | No additional info | Complete |
| Chelydridae | No description | No additional info | Complete |
| Dermatemydidae | No description | No additional info | Complete |
| Dermochelyidae | No description | No additional info | Complete |
Implementasi kolom data_status: Fungsi assign_status
dengan looping dan IF/IF-ELSE diterapkan pada keempat dataset scraping.
Tiga kondisi yang digunakan:
(1) Nilai kolom kunci kosong, NA,
“Unknown”, atau “No description” → status “Incomplete”.
(2) Panjang
karakter kurang dari 3 → status “Incomplete”.
(3) Selain itu →
status “Complete”.
Hasil pada Turtles (contoh): Seluruh 14 family berstatus
“Complete” karena kolom family_name terisi lengkap (minimal
3 karakter). Meskipun deskripsi berisi “No description”, kondisi (1)
tidak terpicu karena kondisi tersebut hanya memeriksa kolom kunci
(family_name), bukan kolom deskripsi.
Kepatuhan soal: Minimal 3 kondisi conditional logic terpenuhi
(elemen tidak ditemukan → “default”, data tidak lengkap → “Incomplete”,
data valid → “Complete”). Kolom data_status berhasil
ditambahkan pada setiap dataset scraping.
1. Website paling mudah di-scrape: Countries of the World — halaman statis HTML sederhana, semua data langsung tersedia dalam tag HTML tanpa rendering JavaScript atau autentikasi.
2. Website paling sulit: Oscar Winning Films — menggunakan AJAX/JavaScript untuk memuat konten secara dinamis. Data tidak tersedia dalam source HTML awal sehingga memerlukan inspeksi network atau penggunaan Selenium.
3. Perbedaan Pendekatan:
read_html() + CSS selector. Paling sederhana dan cepat.
Minimal 2 Rekomendasi: - Gunakan caching atau rate limiting saat scraping multi-page untuk menghindari pemblokiran IP dari server target. - Kombinasikan rvest (R) dengan Selenium (Python) untuk menangani website hybrid antara static content dan dynamic AJAX — manfaatkan kelebihan masing-masing bahasa.