NIM: 52250068
NIM: 52250073
NIM: 52250058
Sistem telah berhasil melakukan scanning folder secara otomatis dan menggabungkan data dari berbagai format file.
Berdasarkan audit otomatis pada dataset terintegrasi, ditemukan masalah kualitas data sebagai berikut:
2080 Transaksi
1830 Produk
5900 Pesanan
Inspeksi struktur dan kualitas dataset hasil scraping
Dimension: 250 Rows x 3 Cols
Columns:
country_name, capital, population
Data Type: Character (Raw Text)
Missing Values: 0
Duplicates: 0
Dimension: 75 Rows x 4 Cols
Columns:
team_name, year, wins, points
Data Type: Mixed Character/Numeric
Missing Values: 0
Duplicates: 50
Website Paling Mudah: Countries of the World. Karena bersifat Static HTML. Semua data tersedia langsung di source code awal, strukturnya konsisten, dan tidak memerlukan interaksi user.
Website Paling Sulit: Hockey Teams. Karena menggunakan sistem Pagination. Scraping memerlukan logika perulangan (looping) URL dan penanganan data duplikat (terdeteksi 50 baris) yang muncul di halaman berbeda.
| Static | : Data tertulis langsung di HTML (sekali request). |
| Pagination | : Data terbagi di banyak halaman (butuh looping URL). |
| AJAX | : Data dimuat dinamis via JavaScript (butuh API/Selenium). |
| Iframe | : Data dibungkus dalam frame terpisah (butuh URL src). |
Sumber pustaka dan dokumentasi library pendukung yang digunakan.
© 2026 | UTS Data Science - Institut Teknologi Sains Bandung
---
title: "UTS Pemrograman Data Science"
output:
  flexdashboard::flex_dashboard:
    vertical_layout: scroll
    theme: yeti
    source_code: embed
    self_contained: false
---
<style>
/* Highlighted information panel (class "custom-info-box"): light-blue
   background, blue accent bar on the left, soft drop shadow. */
.custom-info-box {
background-color: #f0f7ff;
border-left: 5px solid #2196F3;
padding: 15px;
margin: 10px 0;
border-radius: 4px;
box-shadow: 2px 2px 5px rgba(0,0,0,0.1);
/* Justify the text so both the left and right edges line up */
text-align: justify;
}
/* Panel heading: darker blue, bold, no gap above it */
.custom-info-box h4 {
color: #1565C0;
margin-top: 0;
font-weight: bold;
}
/* Body text and list items inside the panel */
.custom-info-box p, .custom-info-box li {
color: #333;
line-height: 1.5;
}
/* Indent for bullet lists inside the panel */
.custom-info-box ul {
padding-left: 20px;
}
</style>
<style>
/* Cap elements with class "chart-stage" at 600px tall and let them scroll
   vertically; !important overrides any competing rule. */
.chart-stage {
overflow-y: auto !important;
max-height: 600px !important;
}
/* Wrap long lines inside preformatted blocks instead of overflowing */
pre {
white-space: pre-wrap;
word-wrap: break-word;
}
</style>
```{r setup, include=FALSE}
# Packages this dashboard depends on. Every package attached anywhere in the
# document is listed here so that a fresh machine installs all of them up
# front, not only the ones attached in this chunk.
packages <- c(
  "flexdashboard",
  "tidyverse",
  "highcharter",
  "viridis",
  "DT",
  "gapminder",
  "jsonlite",
  "xml2",
  "readxl",
  "rvest",
  "htmltools"
)

# Install only what is missing. install.packages() cannot prompt for a CRAN
# mirror in a non-interactive session, so fall back to an explicit mirror when
# none has been configured.
installed <- packages %in% rownames(installed.packages())
if (any(!installed)) {
  repos <- getOption("repos")
  if (is.null(repos) || identical(unname(repos["CRAN"]), "@CRAN@")) {
    repos <- c(CRAN = "https://cloud.r-project.org")
  }
  install.packages(packages[!installed], repos = repos)
}

# Attach the packages used across the dashboard (each one exactly once).
library(flexdashboard)
library(tidyverse)
library(highcharter)
library(viridis)
library(DT)
library(gapminder)
library(jsonlite)
library(xml2)
```
**MEMBERS** {data-orientation=rows}
=======================================================================
```{r}
library(htmltools)

# Stylesheet for the member cards, kept as one literal string.
card_css <- HTML("
.card-container {
display: flex;
justify-content: center;
gap: 20px;
flex-wrap: wrap;
padding: 20px;
background-color: #f8f9fa;
}
.my-profile-card {
background: white !important;
width: 280px;
padding: 20px;
border-radius: 1px;
box-shadow: 0 4px 15px rgba(0,0,0,0.1);
text-align: center;
color: #333 !important;
border: 1px solid #ddd;
}
.my-profile-card img {
aspect-ratio: 4/5;
border-radius: 10px !important;
width: 150px !important;
object-fit: cover;
object-position: top center;
margin-bottom: 15px;
display: block;
margin-left: auto;
margin-right: auto;
}
.my-badge {
display: inline-block;
padding: 2px 10px;
border-radius: 10px;
font-size: 10px;
color: white;
margin: 2px;
font-weight: bold;
}
")

# One record per team member: photo URL, display name, student ID (NIM) and
# the skill badges as a "label = inline style" character vector.
members <- list(
  list(
    photo = "https://raw.githubusercontent.com/chandra240205-sudo/Chandra3/main/Ganteng.jpg",
    name = "Chandra Rizal Alamsyah",
    nim = "52250068",
    badges = c(
      "R Programming" = "background:#007bff",
      "Data Science" = "background:#28a745"
    )
  ),
  list(
    photo = "https://raw.githubusercontent.com/chandra240205-sudo/fotoidor3/refs/heads/main/idor.jpg",
    name = "Ignasius Rabi Blolong",
    nim = "52250073",
    badges = c(
      "Analytics" = "background:#17a2b8",
      "Statistics" = "background:#28a745"
    )
  ),
  list(
    photo = "https://raw.githubusercontent.com/chandra240205-sudo/fotomoris/refs/heads/main/IMG-20260420-WA0007.jpg",
    name = "Moris Alexander Pangaribuan",
    nim = "52250058",
    badges = c(
      "Engineering" = "background:#dc3545",
      "Database" = "background:#6c757d"
    )
  )
)

# Build one coloured badge <span>.
build_badge <- function(label, css) {
  tags$span(class = "my-badge", style = css, label)
}

# Build the profile card <div> for a single member record.
build_card <- function(member) {
  badge_spans <- unname(
    Map(build_badge, names(member$badges), unname(member$badges))
  )
  tags$div(
    class = "my-profile-card",
    tags$img(src = member$photo),
    tags$h3(member$name),
    tags$p(tags$strong("NIM: "), member$nim),
    tags$div(badge_spans)
  )
}

# Final widget: the stylesheet followed by the row of cards.
tags$div(
  tags$style(card_css),
  tags$div(class = "card-container", lapply(members, build_card))
)
```
**MINI PROJECT: CASE STUDY E-Commerce**
=======================================================================
## Column {.tabset .tabset-fade data-height=520}
-----------------------------------------------------------------------
### SECTION A – DATA COLLECTION {data-width=1200 .scrollable}
```{r section-a}
# --- 1. LOAD LIBRARIES ---
library(tidyverse)
library(jsonlite)
library(xml2)
library(readxl)
library(htmltools)
library(DT)

# --- 2. SETTINGS & FILE DISCOVERY ---
# In an interactive RStudio session, switch to the folder of the active
# document so the relative scan below looks next to it. The rstudioapi call is
# guarded because it only works inside RStudio with a saved document.
if (interactive() &&
    requireNamespace("rstudioapi", quietly = TRUE) &&
    rstudioapi::isAvailable()) {
  doc_path <- rstudioapi::getActiveDocumentContext()$path
  if (nzchar(doc_path)) {
    setwd(dirname(doc_path))
  }
}

all_files <- list.files(pattern = "ecommerce", ignore.case = TRUE)
# Drop the report itself, its rendered HTML and archives. The dot is escaped
# and the pattern anchored to the end so that only real extensions match
# (the unescaped ".zip" would also have matched a name such as "x_zip.csv").
data_files <- all_files[
  !grepl("\\.(rmd|html|zip)$", all_files, ignore.case = TRUE)
]
all_data_list <- list()

# Columns that must be numeric in every source before the files are combined.
target_num <- c(
  "quantity", "unit_price", "net_sales",
  "shipping_cost", "customer_rating", "discount_pct"
)

# --- 3. IMPORT ---
# Read a single file according to its (case-insensitive) extension.
# Returns a data frame, or NULL for an unsupported extension.
import_one <- function(path) {
  ext <- tolower(tools::file_ext(path))
  if (ext == "csv") {
    read.csv(path, stringsAsFactors = FALSE)
  } else if (ext == "json") {
    fromJSON(path) %>% as.data.frame()
  } else if (ext == "txt") {
    read.table(path, header = TRUE, sep = "|", stringsAsFactors = FALSE)
  } else if (ext == "xml") {
    doc <- read_xml(path)
    nodes <- xml_find_all(doc, ".//Record")
    # One row per <Record>: child tag names become columns, child text the
    # (character) values.
    map_df(nodes, function(x) {
      children <- xml_children(x)
      set_names(xml_text(children), xml_name(children))
    })
  } else if (ext == "xlsx") {
    read_xlsx(path)
  } else {
    NULL
  }
}

for (f in data_files) {
  df <- tryCatch(
    import_one(f),
    error = function(e) {
      # Report the failure instead of dropping the file silently.
      warning("Failed to import ", f, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
  if (!is.null(df)) {
    colnames(df) <- tolower(colnames(df))
    # Harmonise types so bind_rows() can stack the sources: strip everything
    # except digits, dot and minus, then convert to numeric.
    df <- df %>%
      mutate(across(
        any_of(target_num),
        ~ as.numeric(gsub("[^0-9.-]", "", as.character(.)))
      ))
    all_data_list[[f]] <- df
  }
}

# --- 4. COMBINE ---
# Fail with a clear message when nothing could be imported, instead of a later
# "object 'main_dataset' not found".
if (length(all_data_list) == 0) {
  stop(
    "No 'ecommerce' data files could be imported from ", getwd(),
    call. = FALSE
  )
}
main_dataset <- bind_rows(all_data_list)
# Also publish the combined data in the global environment, as before.
assign("main_dataset", main_dataset, envir = .GlobalEnv)

# --- 5. FIGURES FOR THE INFO BOXES ---
# Count the files that were actually imported, which is what the
# "Files Processed" card claims to show.
total_files <- length(all_data_list)
total_rows <- nrow(main_dataset)
total_cols <- ncol(main_dataset)

# --- 6. VISUAL OUTPUT ---
browsable(
  tagList(
    tags$div(
      style = "display: flex; gap: 15px; font-family: sans-serif; margin-bottom: 20px;",
      # Card 1: files
      tags$div(
        style = "flex: 1; background: #fff3e0; padding: 20px; border-radius: 10px; text-align: center; border-left: 5px solid #ff9800;",
        tags$span(style = "color: #ef6c00; font-weight: bold;", "📂 Files Processed"),
        tags$h2(style = "margin: 10px 0;", total_files),
        tags$small("Multi-format Sources")
      ),
      # Card 2: rows
      tags$div(
        style = "flex: 1; background: #e1f5fe; padding: 20px; border-radius: 10px; text-align: center; border-left: 5px solid #03a9f4;",
        tags$span(style = "color: #0277bd; font-weight: bold;", "📊 Total Records"),
        tags$h2(style = "margin: 10px 0;", format(total_rows, big.mark = ".")),
        tags$small("Rows Integrated")
      ),
      # Card 3: columns
      tags$div(
        style = "flex: 1; background: #f3e5f5; padding: 20px; border-radius: 10px; text-align: center; border-left: 5px solid #9c27b0;",
        tags$span(style = "color: #7b1fa2; font-weight: bold;", "📋 Data Dimensions"),
        tags$h2(style = "margin: 10px 0;", total_cols),
        tags$small("Columns Detected")
      )
    ),
    tags$div(
      style = "background-color: #e8f5e9; border-left: 5px solid #4caf50; padding: 15px; border-radius: 5px; font-family: sans-serif;",
      tags$h4(style = "margin-top: 0; color: #2e7d32;", "✅ Integrasi Multi-Format Berhasil"),
      tags$p("Sistem telah berhasil melakukan scanning folder secara otomatis dan menggabungkan data dari berbagai format file."),
      tags$ul(
        tags$li(tags$b("Metode:"), " Automated File Discovery & Pattern Matching."),
        tags$li(tags$b("Sinkronisasi:"), " Penyeragaman tipe data dilakukan pada kolom numerik."),
        tags$li(tags$b("Output:"), " Dataset tunggal siap untuk tahap audit.")
      )
    )
  )
)
```
### SECTION B – DATA HANDLING {data-width=1200}
```{r section-b}
library(tidyverse)
library(DT)
library(htmltools)

# 1. DATASET IDENTIFICATION
# Basic shape of the combined dataset plus the number of fully duplicated rows.
jml_baris <- nrow(main_dataset)
jml_kolom <- ncol(main_dataset)
jml_duplikat <- sum(duplicated(main_dataset))

# Per-column audit table: position, name, primary class and missing count.
# seq_along() stays correct for a zero-column input (1:ncol() would give
# c(1, 0)), and vapply() guarantees a character vector whatever the input.
audit_table <- data.frame(
  "No" = seq_along(main_dataset),
  "Nama_Kolom" = names(main_dataset),
  "Tipe_Data" = vapply(main_dataset, function(x) class(x)[1], character(1)),
  "Jumlah_Missing" = colSums(is.na(main_dataset))
)

# 2. TABLE WIDGET
# dom = "t" shows the table body only (no search box or pager).
output_tabel <- datatable(
  audit_table,
  rownames = FALSE,
  options = list(
    pageLength = 10,
    dom = "t",
    scrollX = TRUE,
    columnDefs = list(list(className = "dt-center", targets = "_all"))
  ),
  caption = paste("Audit Dataset: Total", jml_baris, "Baris &", jml_kolom, "Kolom")
)

# 3. RENDER THE TABLE AND THE FINDINGS PANEL
browsable(
  tagList(
    output_tabel,
    tags$br(),
    tags$div(
      style = "background-color: #fffde7; border-left: 5px solid #fbc02d; padding: 20px; font-family: sans-serif; border-radius: 5px;",
      tags$h4(style = "margin-top: 0; color: #856404;", "Hasil Identifikasi Awal"),
      tags$p("Berdasarkan audit otomatis pada dataset terintegrasi, ditemukan masalah kualitas data sebagai berikut:"),
      tags$ul(
        tags$li(tags$b("Data Duplikat:"), sprintf(" Terdeteksi sebanyak %d baris data ganda yang perlu dibersihkan pada tahap selanjutnya.", jml_duplikat)),
        tags$li(tags$b("Inkonsistensi Format:"), " Kolom platform dan order_status memiliki variasi penulisan (case-sensitivity) yang tidak seragam."),
        tags$li(tags$b("Masalah Tipe Data & Missing:"), " Terdapat Missing Values pada kolom rating dan payment method, serta perlunya validasi ulang pada kolom numerik hasil konversi dari format teks.")
      )
    )
  )
)
```
### SECTION C – DATA CLEANING {data-width=1200}
```{r section-c-final-banget, echo=FALSE, message=FALSE, warning=FALSE}
# --- 1. CLEANING ---
library(tidyverse)
library(htmltools)

df <- main_dataset
n_awal <- nrow(df)

# Capitalise the first letter of every word ("TOKOPEDIA" -> "Tokopedia").
# Missing values stay missing.
to_title_case <- function(x) {
  gsub("\\b([a-z])", "\\U\\1", tolower(x), perl = TRUE)
}

# Loop over the three categorical columns: upper-case and trim them so that
# spelling variants collapse to a single form before the comparisons below.
target_cols <- c("platform", "order_status", "payment_method")
for (col in target_cols) {
  df[[col]] <- trimws(toupper(df[[col]]))
}

# Row-wise standardisation with explicit IF branches. `%in%` is used instead
# of `==` because it yields FALSE (not NA) for a missing value, so a row with
# a missing platform or status no longer aborts the loop.
for (i in seq_len(nrow(df))) {
  # Platform: fold the abbreviation into the full name
  if (df$platform[i] %in% "TOKPED") {
    df$platform[i] <- "TOKOPEDIA"
  }
  # Status: "delivered" is reported as "completed"
  if (df$order_status[i] %in% "DELIVERED") {
    df$order_status[i] <- "COMPLETED"
  }
  # Payment: missing or empty becomes an explicit label
  if (is.na(df$payment_method[i]) || df$payment_method[i] == "") {
    df$payment_method[i] <- "Unknown"
  }
}

# Present platform and status in title case. Doing this for the whole column
# keeps an alias and its full spelling in ONE category (previously "TOKPED"
# became "Tokopedia" while "TOKOPEDIA" stayed upper-case, and likewise for
# "DELIVERED" -> "Completed" versus "COMPLETED").
df$platform <- to_title_case(df$platform)
df$order_status <- to_title_case(df$order_status)

# Sales: only parse when the column is still text. Re-parsing an already
# numeric column through as.character() corrupts it: 1e6 prints as "1e+06",
# and stripping non-digits also removes decimal points and minus signs.
if (!is.numeric(df$net_sales)) {
  df$net_sales <- as.numeric(gsub("[^0-9.-]", "", as.character(df$net_sales)))
}
# Negative sales are clamped to zero; missing values stay missing.
df$net_sales <- pmax(df$net_sales, 0)

# Rating imputation: missing ratings get the neutral midpoint 3.0
df$customer_rating <- ifelse(is.na(df$customer_rating), 3.0, df$customer_rating)

# Remove exact duplicate rows so the exported dataset matches the
# "already cleaned" claim on the card, and report how many were dropped.
df_unik <- distinct(df)
n_duplikat <- n_awal - nrow(df_unik)
df <- df_unik

# Cleaned dataset consumed by the following sections.
dataset_siap_pamer <- df

# --- 2. VISUAL OUTPUT ---
browsable(
  tagList(
    tags$div(
      style = "display: flex; gap: 10px; margin-bottom: 20px; font-family: sans-serif;",
      # Card 1: duplicate rows
      tags$div(
        style = "flex: 1; background-color: #ffebee; border-left: 5px solid #f44336; padding: 15px; border-radius: 4px;",
        tags$span(style = "color: #f44336; font-size: 14px;", "Data Duplikat"),
        tags$h2(style = "margin: 5px 0;", n_duplikat),
        tags$small(style = "color: #666;", "Sudah dibersihkan")
      ),
      # Card 2: status
      tags$div(
        style = "flex: 1; background-color: #e8f5e9; border-left: 5px solid #4caf50; padding: 15px; border-radius: 4px;",
        tags$span(style = "color: #2e7d32; font-size: 14px;", "Status Data"),
        tags$h2(style = "margin: 5px 0;", "Cleaned"),
        tags$small(style = "color: #666;", "Looping & IF Applied")
      )
    ),
    # Explanation panel
    tags$div(
      class = "custom-info-box",
      tags$h4("📝 Logika Pembersihan Data (Compliance Check)"),
      tags$ul(
        tags$li(tags$b("Looping:"), " Digunakan untuk membersihkan whitespace pada 3 kolom utama."),
        tags$li(tags$b("Standardisasi (IF):"), " Mengubah variasi penulisan platform dan status order."),
        tags$li(tags$b("Rating Imputation:"), " Nilai kosong diisi dengan 3.0 sebagai titik tengah netral.")
      )
    )
  )
)
```
### SECTION D – CONDITIONAL LOGIC {data-width=1200}
```{r section-d-logic}
library(tidyverse)
library(htmltools)

# Build one summary card: coloured panel with a title, a large value and a
# small note underneath.
stat_card <- function(bg, accent, title_color, title, value, note) {
  tags$div(
    style = sprintf(
      "flex: 1; background-color: %s; border-left: 5px solid %s; padding: 15px; border-radius: 4px;",
      bg, accent
    ),
    tags$span(style = sprintf("color: %s; font-size: 14px;", title_color), title),
    tags$h2(style = "margin: 5px 0;", value),
    tags$small(style = "color: #666;", note)
  )
}

# 1. DERIVED FLAG COLUMNS
#    is_high_value     - "Yes" when net sales exceed 1,000,000, else "No"
#    order_priority    - nested ifelse: High (> 1,000,000), Medium
#                        (>= 500,000), otherwise Low
#    valid_transaction - "Invalid" for cancelled orders, else "Valid"
dataset_final_logic <- mutate(
  dataset_siap_pamer,
  is_high_value = if_else(net_sales > 1000000, "Yes", "No"),
  order_priority = ifelse(
    net_sales > 1000000,
    "High",
    ifelse(net_sales >= 500000, "Medium", "Low")
  ),
  valid_transaction = if_else(order_status == "Cancelled", "Invalid", "Valid")
)

# Keep a copy for the section that follows.
dataset_siap_visual <<- dataset_final_logic

# 2. FIGURES FOR THE CARDS
# which() skips NA comparisons, so rows with missing values are not counted.
total_high <- length(which(dataset_final_logic$is_high_value == "Yes"))
total_valid <- length(which(dataset_final_logic$valid_transaction == "Valid"))

# 3. CARDS AND PREVIEW TABLE (first 100 rows)
preview_table <- datatable(
  head(dataset_final_logic, 100),
  options = list(pageLength = 5, scrollX = TRUE),
  caption = "Data dengan Implementasi Conditional Logic (Nested IF)"
)

browsable(
  tagList(
    tags$div(
      style = "display: flex; gap: 10px; margin-bottom: 20px; font-family: sans-serif;",
      stat_card(
        "#e3f2fd", "#2196f3", "#1976d2",
        "High Value Orders", total_high, "Sales > 1.000.000"
      ),
      stat_card(
        "#e8f5e9", "#4caf50", "#2e7d32",
        "Valid Transactions", total_valid, "Excluding Cancelled"
      )
    ),
    preview_table
  )
)
```
### SECTION E – ANALYTICAL THINKING {data-width=1200}
```{r section-e-insight, echo=FALSE, message=FALSE, warning=FALSE}
library(tidyverse)
library(htmltools)

# 1. ANALYSIS
# Return the single most frequent value of `col` in `data` as a one-row
# tibble with the value (column named after `col`) and its row count `total`.
# Shared by the three questions below, which previously repeated this
# pipeline verbatim.
top_group <- function(data, col) {
  data %>%
    group_by(across(all_of(col))) %>%
    summarise(total = n(), .groups = "drop") %>%
    arrange(desc(total)) %>%
    slice(1)
}

# Question 1: dominant platform
top_platform <- top_group(dataset_siap_visual, "platform")
# Question 2: most frequent category
top_category <- top_group(dataset_siap_visual, "category")
# Question 3: most common order status
top_status <- top_group(dataset_siap_visual, "order_status")

# 2. INSIGHT DISPLAY
browsable(
  tagList(
    tags$div(
      style = "display: flex; gap: 15px; font-family: sans-serif;",
      # Box 1: platform
      tags$div(
        style = "flex: 1; background: #e3f2fd; padding: 20px; border-radius: 10px; text-align: center; border: 1px solid #bbdefb;",
        tags$h5("Platform Dominan"),
        tags$h2(style = "color: #1976d2;", top_platform$platform),
        tags$p(paste(top_platform$total, "Transaksi"))
      ),
      # Box 2: category
      tags$div(
        style = "flex: 1; background: #f3e5f5; padding: 20px; border-radius: 10px; text-align: center; border: 1px solid #e1bee7;",
        tags$h5("Top Category"),
        tags$h2(style = "color: #7b1fa2;", top_category$category),
        tags$p(paste(top_category$total, "Produk"))
      ),
      # Box 3: status
      tags$div(
        style = "flex: 1; background: #e8f5e9; padding: 20px; border-radius: 10px; text-align: center; border: 1px solid #c8e6c9;",
        tags$h5("Status Terbanyak"),
        tags$h2(style = "color: #2e7d32;", top_status$order_status),
        tags$p(paste(top_status$total, "Pesanan"))
      )
    ),
    tags$br(),
    tags$div(
      style = "background-color: #f8f9fa; padding: 20px; border-radius: 5px; border-left: 6px solid #343a40;",
      # Heading typo fixed ("Analisi" -> "Analisis").
      tags$h4("Hasil Analisis Data:"),
      tags$ol(
        tags$li(tags$b("Platform Paling Dominan:"), sprintf(" Berdasarkan data, %s memimpin pasar dengan total %d transaksi.", top_platform$platform, top_platform$total)),
        tags$li(tags$b("Kategori Paling Sering Muncul:"), sprintf(" Kategori %s menjadi primadona pelanggan dengan frekuensi kemunculan sebanyak %d kali.", top_category$category, top_category$total)),
        tags$li(tags$b("Status Transaksi Terbanyak:"), sprintf(" Mayoritas transaksi saat ini berstatus %s (%d pesanan).", top_status$order_status, top_status$total))
      )
    )
  )
)
```
**WEB SCRAPING & DATA PROGRAMMING PROCESS** {data-orientation=rows}
=======================================================================
## Column {.tabset .tabset-fade data-height=520}
-----------------------------------------------------------------------
### SECTION A – DATA COLLECTION USING PROGRAMMING {data-width=1200}
```{r scrap-countries, echo=FALSE, message=FALSE, warning=FALSE}
library(rvest)
library(tidyverse)
library(htmltools)
library(DT)

# --- SCRAPING LOGIC ---
# Single static page: every country is one element with class "country".
url_countries <- "http://www.scrapethissite.com/pages/simple/"
page_countries <- read_html(url_countries)
country_nodes <- page_countries %>% html_nodes(".country")

# Element loop. The result list is preallocated, and seq_along() makes the
# loop a no-op when the page yields no matching elements (1:length() would
# iterate over c(1, 0) and fail).
all_countries_list <- vector("list", length(country_nodes))
for (i in seq_along(country_nodes)) {
  node <- country_nodes[[i]]
  all_countries_list[[i]] <- data.frame(
    country_name = node %>% html_node(".country-name") %>% html_text(trim = TRUE),
    capital = node %>% html_node(".country-capital") %>% html_text(trim = TRUE),
    population = node %>% html_node(".country-population") %>% html_text(trim = TRUE),
    stringsAsFactors = FALSE
  )
}

if (length(all_countries_list) > 0) {
  df_countries <- bind_rows(all_countries_list)
} else {
  # Nothing scraped: keep the expected columns so later sections can still
  # refer to them by name.
  df_countries <- data.frame(
    country_name = character(0),
    capital = character(0),
    population = character(0),
    stringsAsFactors = FALSE
  )
}
write.csv(df_countries, "countries_data.csv", row.names = FALSE)

# --- INFO BOX & INTERACTIVE TABLE ---
browsable(
  tagList(
    tags$div(
      style = "background: linear-gradient(135deg, #1e3c72 0%, #2a5298 100%); padding: 20px; border-radius: 10px; color: white; font-family: sans-serif; margin-bottom: 20px;",
      tags$h2(style = "margin:0;", "🌍 Countries of the World"),
      tags$p(style = "opacity: 0.8;", "")
    ),
    tags$div(
      style = "display: flex; gap: 15px; margin-bottom: 20px;",
      tags$div(
        style = "flex: 1; background: #e3f2fd; border-left: 5px solid #2196f3; padding: 15px; border-radius: 5px; font-family: sans-serif;",
        tags$b(style = "color: #1565c0;", "📊 Data Collected:"),
        tags$h3(style = "margin: 5px 0;", paste(nrow(df_countries), "Countries")),
        tags$small("Fitur: Interactive Table Enabled")
      )
    ),
    datatable(
      df_countries,
      options = list(pageLength = 10, scrollX = TRUE),
      rownames = FALSE,
      caption = "Tabel Data Negara"
    )
  )
)
```
```{r web-2-Hockey Teams, echo=FALSE, message=FALSE, warning=FALSE}
library(rvest)
library(tidyverse)
library(htmltools)
library(DT)

# --- SCRAPING LOGIC ---
# Paginated listing: each page is requested through the "page_num" query
# parameter. The loop previously requested the same base URL on every
# iteration, which fetched page 1 three times and produced the duplicated
# rows seen downstream.
base_url_hockey <- "http://www.scrapethissite.com/pages/forms/"
pages_to_scrape <- 1:3
all_hockey_list <- vector("list", length(pages_to_scrape))

for (idx in seq_along(pages_to_scrape)) {
  p <- pages_to_scrape[idx]
  url_hockey <- paste0(base_url_hockey, "?page_num=", p)
  page_h <- tryCatch(
    read_html(url_hockey),
    error = function(e) {
      # Keep going with the remaining pages, but report the failure.
      warning("Failed to read ", url_hockey, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
  if (!is.null(page_h)) {
    tables <- page_h %>% html_table(fill = TRUE)
    if (length(tables) > 0) {
      table_data <- tables[[1]]
      if (nrow(table_data) > 0) {
        # Columns are picked by position and stored as text.
        # NOTE(review): column 6 is exported as "points"; confirm against the
        # page's table header that the sixth column really holds points.
        clean_data <- data.frame(
          team_name = as.character(table_data[[1]]),
          year = as.character(table_data[[2]]),
          wins = as.character(table_data[[3]]),
          points = as.character(table_data[[6]]),
          stringsAsFactors = FALSE
        )
        all_hockey_list[[idx]] <- clean_data
      }
    }
  }
}

# Pages that failed leave NULL entries; drop them before combining.
all_hockey_list <- Filter(Negate(is.null), all_hockey_list)
if (length(all_hockey_list) > 0) {
  # Remove any header row that was parsed as data.
  df_hockey <- bind_rows(all_hockey_list) %>% filter(team_name != "Team Name")
} else {
  # Nothing scraped: keep the expected columns so later sections still work.
  df_hockey <- data.frame(
    team_name = character(0),
    year = character(0),
    wins = character(0),
    points = character(0),
    stringsAsFactors = FALSE
  )
}
write.csv(df_hockey, "hockey_data.csv", row.names = FALSE)

# --- INFO BOX & INTERACTIVE TABLE ---
browsable(
  tagList(
    tags$div(
      style = "background: linear-gradient(135deg, #485563 0%, #29323c 100%); padding: 20px; border-radius: 10px; color: white; font-family: sans-serif; margin-bottom: 20px;",
      tags$h2(style = "margin:0;", "🏒 Hockey Teams Data"),
      tags$p(style = "opacity: 0.8;", "")
    ),
    tags$div(
      style = "display: flex; gap: 15px; margin-bottom: 20px;",
      tags$div(
        style = "flex: 1; background: #fff3e0; border-left: 5px solid #ff9800; padding: 15px; border-radius: 5px; font-family: sans-serif;",
        tags$b(style = "color: #e65100;", "📊 Total Records:"),
        tags$h3(style = "margin: 5px 0;", paste(nrow(df_hockey), "Teams")),
        # Derived from the page range so the label cannot go stale.
        tags$small(paste("Halaman Scraped:", min(pages_to_scrape), "-", max(pages_to_scrape)))
      )
    ),
    # Interactive table
    datatable(
      df_hockey,
      options = list(pageLength = 10, scrollX = TRUE),
      rownames = FALSE,
      caption = "Tabel Data Hockey"
    )
  )
)
```
### SECTION B – DATA HANDLING {data-width=1200}
```{r section-b-audit, echo=FALSE, message=FALSE, warning=FALSE}
library(htmltools)
library(tidyverse)

# --- 1. AUDIT FIGURES ---
# Web 1: Countries
c_rows <- nrow(df_countries)
c_cols <- ncol(df_countries)
c_names <- paste(colnames(df_countries), collapse = ", ")
c_na <- sum(is.na(df_countries))
c_dup <- sum(duplicated(df_countries))
# Web 2: Hockey
h_rows <- nrow(df_hockey)
h_cols <- ncol(df_hockey)
h_names <- paste(colnames(df_hockey), collapse = ", ")
h_na <- sum(is.na(df_hockey))
h_dup <- sum(duplicated(df_hockey))

# Inline style for a count: red and bold when non-zero, green otherwise.
count_style <- function(n) {
  if (n > 0) "color:red; font-weight:bold" else "color:green"
}

# Build one audit panel. `accent` colours the top border and the heading;
# `issues` is a character vector rendered as a bullet list.
audit_box <- function(title, accent, n_rows, n_cols, col_names,
                      data_type, n_na, n_dup, issues) {
  tags$div(
    style = sprintf(
      "flex: 1; background: white; border: 1px solid #ddd; border-top: 5px solid %s; padding: 20px; border-radius: 8px;",
      accent
    ),
    tags$h4(style = sprintf("color: %s; margin-top: 0;", accent), title),
    tags$hr(),
    tags$p(tags$b("Dimension: "), paste(n_rows, "Rows x", n_cols, "Cols")),
    tags$p(tags$b("Columns: "), tags$code(col_names)),
    tags$p(tags$b("Data Type: "), data_type),
    tags$p(tags$b("Missing Values: "), tags$span(style = count_style(n_na), n_na)),
    tags$p(tags$b("Duplicates: "), tags$span(style = count_style(n_dup), n_dup)),
    tags$div(
      style = "background: #fff3e0; padding: 10px; border-radius: 5px; margin-top:15px;",
      tags$b("⚠️ Data Issues:"),
      tags$ul(style = "margin-bottom:0; font-size: 0.9em;", lapply(issues, tags$li))
    )
  )
}

# The duplicate finding is derived from the computed count instead of a
# hard-coded figure, so it stays true whatever the scrape returns.
hockey_dup_issue <- if (h_dup > 0) {
  sprintf("Terdeteksi %d data duplikat (high redundancy).", h_dup)
} else {
  "Tidak ada data duplikat terdeteksi."
}

# --- 2. AUDIT DASHBOARD (two panels side by side) ---
browsable(
  tagList(
    # Section header
    tags$div(
      style = "background: #f44336; padding: 20px; border-radius: 10px; color: white; font-family: sans-serif; margin-bottom: 25px;",
      tags$h2(style = "margin:0;", "📋 Data Handling & Audit Report"),
      tags$p(style = "margin:5px 0 0 0; opacity: 0.8;", "Inspeksi struktur dan kualitas dataset hasil scraping")
    ),
    tags$div(
      style = "display: flex; gap: 20px; font-family: sans-serif;",
      # Left panel: countries
      audit_box(
        title = "🌍 Audit: Countries of the World",
        accent = "#1e3c72",
        n_rows = c_rows, n_cols = c_cols, col_names = c_names,
        data_type = "Character (Raw Text)",
        n_na = c_na, n_dup = c_dup,
        issues = c(
          "Kolom Population masih bertipe character.",
          "Data masih mengandung simbol pemisah ribuan (koma)."
        )
      ),
      # Right panel: hockey. The scraping step stores every hockey column as
      # character, so the type label says so.
      audit_box(
        title = "🏒 Audit: Hockey Teams Data",
        accent = "#29323c",
        n_rows = h_rows, n_cols = h_cols, col_names = h_names,
        data_type = "Character (Raw Text)",
        n_na = h_na, n_dup = h_dup,
        issues = c(
          hockey_dup_issue,
          "Whitespace pada nama tim perlu dibersihkan."
        )
      )
    )
  )
)
```
### SECTION C – DATA CLEANING {data-width=1200}
```{r section-c-cleaning, echo=FALSE, message=FALSE, warning=FALSE}
library(tidyverse)
library(htmltools)
library(DT)

# Lower-case and trim a text column; when any value is missing, replace the
# missing entries with `placeholder`.
tidy_text <- function(values, placeholder) {
  cleaned <- trimws(tolower(values))
  gaps <- is.na(cleaned)
  if (any(gaps)) {
    cleaned[gaps] <- placeholder
  }
  cleaned
}

# =============================================================
# 1. COUNTRIES
# =============================================================
df_countries_clean <- df_countries
# Loop over every column; missing text becomes "unknown".
for (country_col in colnames(df_countries_clean)) {
  df_countries_clean[[country_col]] <- tidy_text(
    df_countries_clean[[country_col]], "unknown"
  )
}
# Population: keep digits only (taken from the raw scrape), then convert.
population_digits <- gsub("[^0-9]", "", df_countries$population)
df_countries_clean$population <- as.numeric(population_digits)

# =============================================================
# 2. HOCKEY
# =============================================================
# Start from the de-duplicated rows.
df_hockey_clean <- distinct(df_hockey)
# Columns that are converted to numbers below get "0" for missing values,
# every other column gets "none".
numeric_like <- c("wins", "points", "year")
for (hockey_col in colnames(df_hockey_clean)) {
  if (hockey_col %in% numeric_like) {
    filler <- "0"
  } else {
    filler <- "none"
  }
  df_hockey_clean[[hockey_col]] <- tidy_text(df_hockey_clean[[hockey_col]], filler)
}
# Type conversion
df_hockey_clean$year <- as.integer(df_hockey_clean$year)
df_hockey_clean$wins <- as.numeric(df_hockey_clean$wins)
df_hockey_clean$points <- as.numeric(df_hockey_clean$points)

# =============================================================
# OUTPUT (countries and hockey previews)
# =============================================================
countries_preview <- datatable(
  df_countries_clean,
  options = list(pageLength = 5, scrollX = TRUE),
  rownames = FALSE,
  class = "cell-border stripe"
)
hockey_preview <- datatable(
  df_hockey_clean,
  options = list(pageLength = 5, scrollX = TRUE),
  rownames = FALSE,
  class = "cell-border stripe"
)

browsable(
  tagList(
    tags$div(
      style = "background: linear-gradient(135deg, #4caf50 0%, #2e7d32 100%); padding: 20px; border-radius: 10px; color: white; font-family: sans-serif; margin-bottom: 25px;",
      tags$h2(style = "margin:0;", "✨ Section C: Data Cleaning Success"),
      tags$p(style = "margin:5px 0 0 0; opacity: 0.9;", "")
    ),
    # Preview 1: countries
    tags$div(
      style = "margin-bottom: 40px;",
      tags$h4(
        style = "font-family:sans-serif; color: #1e3c72;",
        "🌍 Cleaned Data: Countries of the World"
      ),
      countries_preview
    ),
    # Preview 2: hockey
    tags$div(
      style = "margin-bottom: 20px;",
      tags$h4(
        style = "font-family:sans-serif; color: #29323c;",
        "🏒 Cleaned Data: Hockey Teams"
      ),
      hockey_preview
    )
  )
)
```
### SECTION D – CONDITIONAL LOGIC {data-width=1200}
```{r section d, echo=FALSE, message=FALSE, warning=FALSE}
library(tidyverse)
library(htmltools)
library(DT)

# =============================================================
# 1. CONDITIONAL LOGIC: COUNTRIES
# =============================================================
# Classify one country row:
#   "Incomplete"   - name or capital is the "unknown" placeholder, or the
#                    population is missing
#   "Invalid Data" - population is present but not positive
#   "Complete"     - everything present and valid
# `%in%` and `||` are used so a missing value can never reach `if` as NA.
classify_country <- function(name, capital, population) {
  if (name %in% "unknown" || capital %in% "unknown" || is.na(population)) {
    "Incomplete"
  } else if (population <= 0) {
    "Invalid Data"
  } else {
    "Complete"
  }
}

# A character placeholder keeps the new column's type stable from the start.
df_countries_final <- df_countries_clean %>%
  mutate(data_status = NA_character_)
# seq_len() makes the loop a no-op for an empty table.
for (i in seq_len(nrow(df_countries_final))) {
  df_countries_final$data_status[i] <- classify_country(
    df_countries_final$country_name[i],
    df_countries_final$capital[i],
    df_countries_final$population[i]
  )
}

# =============================================================
# 2. CONDITIONAL LOGIC: HOCKEY
# =============================================================
# Classify one hockey row:
#   "Incomplete"      - team name is the "none" placeholder, or wins/points
#                       is missing (a missing points value previously could
#                       abort the loop or be reported as "Complete")
#   "Potential Error" - wins and points are both zero
#   "Complete"        - otherwise
classify_hockey <- function(team_name, wins, points) {
  if (team_name %in% "none" || is.na(wins) || is.na(points)) {
    "Incomplete"
  } else if (wins == 0 && points == 0) {
    "Potential Error"
  } else {
    "Complete"
  }
}

df_hockey_final <- df_hockey_clean %>%
  mutate(data_status = NA_character_)
for (j in seq_len(nrow(df_hockey_final))) {
  df_hockey_final$data_status[j] <- classify_hockey(
    df_hockey_final$team_name[j],
    df_hockey_final$wins[j],
    df_hockey_final$points[j]
  )
}

# =============================================================
# OUTPUT
# =============================================================
browsable(
  tagList(
    # Section header
    tags$div(
      style = "background: linear-gradient(135deg, #673ab7 0%, #512da8 100%); padding: 20px; border-radius: 10px; color: white; font-family: sans-serif; margin-bottom: 25px;",
      tags$h2(style = "margin:0;", "🧠 Section D: Conditional Logic Implementation"),
      tags$p(style = "margin:5px 0 0 0; opacity: 0.9;", "")
    ),
    # Countries table
    tags$h4("🌍 Countries Data Status:", style = "font-family:sans-serif; color: #512da8;"),
    datatable(df_countries_final, options = list(pageLength = 5, scrollX = TRUE), rownames = FALSE),
    tags$br(),
    # Hockey table
    tags$h4("🏒 Hockey Data Status:", style = "font-family:sans-serif; color: #512da8;"),
    datatable(df_hockey_final, options = list(pageLength = 5, scrollX = TRUE), rownames = FALSE)
  )
)
```
### SECTION E – ANALYTICAL THINKING {data-width=1200}
```{r section-e-analysis, echo=FALSE, message=FALSE, warning=FALSE}
library(htmltools)

# Number of duplicated rows in the raw hockey scrape. The narrative below
# quotes this figure, so it is computed from `df_hockey` (built by the hockey
# scraping chunk earlier in this document) instead of being typed in by hand.
hockey_dup_count <- sum(duplicated(df_hockey))

browsable(
  tagList(
    # --- Section header ---
    tags$div(
      style = "background: linear-gradient(135deg, #009688 0%, #00796b 100%); padding: 20px; border-radius: 10px; color: white; font-family: sans-serif; margin-bottom: 25px;",
      tags$h2(style = "margin:0;", "💡 Section E: Analytical Thinking Report"),
      tags$p(style = "margin:5px 0 0 0; opacity: 0.9;", "")
    ),
    # --- Box 1: difficulty analysis (points 1 & 2) ---
    tags$div(
      style = "background: #e0f2f1; border-left: 5px solid #009688; padding: 20px; border-radius: 8px; margin-bottom: 20px; font-family: sans-serif; box-shadow: 2px 2px 5px rgba(0,0,0,0.05);",
      tags$h4(style = "color: #00796b; margin-top: 0;", "1 & 2. Website Difficulty Analysis"),
      tags$p(
        tags$b("Website Paling Mudah: Countries of the World."),
        " Karena bersifat Static HTML. Semua data tersedia langsung di source code awal, strukturnya konsisten, dan tidak memerlukan interaksi user."
      ),
      tags$p(
        tags$b("Website Paling Sulit: Hockey Teams."),
        sprintf(
          " Karena menggunakan sistem Pagination. Scraping memerlukan logika perulangan (looping) URL dan penanganan data duplikat (terdeteksi %d baris) yang muncul di halaman berbeda.",
          hockey_dup_count
        )
      )
    ),
    # --- Box 2: technical comparison (point 3) ---
    tags$div(
      style = "background: #fff3e0; border-left: 5px solid #ff9800; padding: 20px; border-radius: 8px; margin-bottom: 20px; font-family: sans-serif; box-shadow: 2px 2px 5px rgba(0,0,0,0.05);",
      tags$h4(style = "color: #e65100; margin-top: 0;", "3. Perbedaan Pendekatan Scraping"),
      tags$table(
        style = "width: 100%; border-collapse: collapse;",
        tags$tr(tags$td(style = "padding: 5px; width: 100px;", tags$b("Static")), tags$td(": Data tertulis langsung di HTML (sekali request).")),
        tags$tr(tags$td(style = "padding: 5px;", tags$b("Pagination")), tags$td(": Data terbagi di banyak halaman (butuh looping URL).")),
        tags$tr(tags$td(style = "padding: 5px;", tags$b("AJAX")), tags$td(": Data dimuat dinamis via JavaScript (butuh API/Selenium).")),
        tags$tr(tags$td(style = "padding: 5px;", tags$b("Iframe")), tags$td(": Data dibungkus dalam frame terpisah (butuh URL src)."))
      )
    ),
    # --- Box 3: insights & recommendations (point 4) ---
    tags$div(
      style = "background: #e3f2fd; border-left: 5px solid #2196f3; padding: 20px; border-radius: 8px; font-family: sans-serif; box-shadow: 2px 2px 5px rgba(0,0,0,0.05);",
      tags$h4(style = "color: #1565c0; margin-top: 0;", "4. Data Strategy (Insights & Recommendations)"),
      tags$div(
        style = "display: flex; gap: 20px;",
        tags$div(
          style = "flex: 1;",
          tags$b("💡 Key Insights:"),
          tags$ul(
            style = "font-size: 0.9em;",
            tags$li(sprintf("Sistem pagination berisiko tinggi menghasilkan redundansi data (%d duplikat).", hockey_dup_count)),
            tags$li("Data mentah rvest selalu bertipe Character, wajib casting ke Numeric."),
            tags$li("Struktur HTML statis jauh lebih efisien dalam penggunaan resource scraping.")
          )
        ),
        tags$div(
          style = "flex: 1;",
          tags$b("🚀 Recommendations:"),
          tags$ul(
            style = "font-size: 0.9em;",
            tags$li("Wajib menyertakan fungsi distinct() pada setiap pipeline scraping dinamis."),
            tags$li("Gunakan validasi otomatis (Section D) untuk menandai data Incomplete.")
          )
        )
      )
    )
  )
)
```
**REFERENSI**
=======================================================================
```{r section-references, echo=FALSE, message=FALSE, warning=FALSE}
library(htmltools)

# Build one reference entry: a bold label followed by a clickable link whose
# visible text is the URL itself. htmltools escapes plain strings, so the
# former Markdown-style "[url](url)" text was shown literally instead of as a
# link; an <a> tag is required.
reference_item <- function(label, url) {
  tags$li(
    tags$b(label),
    tags$a(href = url, target = "_blank", rel = "noopener", url)
  )
}

browsable(
  tagList(
    # Header
    tags$div(
      style = "background: #37474f; padding: 20px; border-radius: 10px; color: white; font-family: sans-serif; margin-bottom: 20px;",
      tags$h2(style = "margin:0;", "📚 Daftar Referensi & Dokumentasi"),
      tags$p(style = "margin:5px 0 0 0; opacity: 0.8;", "Sumber pustaka dan dokumentasi library pendukung yang digunakan.")
    ),
    # Reference list
    tags$div(
      style = "background: #f5f5f5; border: 1px solid #cfd8dc; padding: 20px; border-radius: 8px; font-family: sans-serif;",
      tags$ul(
        style = "line-height: 1.8;",
        # Course material
        reference_item("Data Collection: ", "https://bookdown.org/dsciencelabs/data_science_programming/04-Data_Collection.html"),
        reference_item("Data Cleaning: ", "https://bookdown.org/dsciencelabs/data_science_programming/05-Data-Cleaning.html"),
        reference_item("Programming Logic: ", "https://bookdown.org/dsciencelabs/data_science_programming/03-Functions-and-Loops.html"),
        # Technical documentation of the libraries used
        reference_item("Interactive Tables: ", "https://rstudio.github.io/DT/"),
        reference_item("Data Manipulation: ", "https://tidyverse.tidyverse.org/"),
        reference_item("HTML UI in R: ", "https://rstudio.github.io/htmltools/")
      )
    ),
    # Footer
    tags$p(
      style = "text-align: center; color: #999; margin-top: 30px; font-family: sans-serif; font-size: 0.8em;",
      "© 2026 | UTS Data Science - Institut Teknologi Sains Bandung"
    )
  )
)
```