Putri Adria Garini
NIM: 52250002Hirose Kawarin Sirait
NIM: 52250012Cecilia Mutiara Handayani
NIM: 52250013read_file <- function(filepath) {
ext <- tolower(tools::file_ext(filepath))
if (ext == "csv") {
return(readr::read_csv(filepath, show_col_types = FALSE))
} else if (ext == "xlsx") {
return(readxl::read_excel(filepath))
} else if (ext == "json") {
data <- jsonlite::fromJSON(filepath)
return(as.data.frame(data))
} else if (ext == "txt") {
return(readr::read_delim(filepath, delim = "|", show_col_types = FALSE))
} else if (ext == "xml") {
tree <- xml2::read_xml(filepath)
records <- xml2::xml_find_all(tree, ".//*[not(*)]/..")
rows <- lapply(records, function(record) {
children <- xml2::xml_children(record)
setNames(as.list(xml2::xml_text(children)), xml2::xml_name(children))
})
return(as.data.frame(do.call(rbind, lapply(rows, as.data.frame))))
} else {
stop(paste("Format file tidak dikenal:", ext))
}
}
file_list <- c("ecommerce.csv", "ecommerce.xlsx", "ecommerce.json", "ecommerce.txt", "ecommerce.xml")
dataframes <- list()
for (filename in file_list) {
df <- read_file(filename)
dataframes[[filename]] <- df
}
df_combined <- do.call(rbind, dataframes)
rownames(df_combined) <- NULLTujuan: Mengambil dan menggabungkan data dari berbagai sumber menggunakan Python
## Warning: package 'readr' was built under R version 4.5.3
##
## Attaching package: 'readr'
## The following object is masked _by_ '.GlobalEnv':
##
## read_file
## Warning: package 'readxl' was built under R version 4.5.3
## Warning: package 'jsonlite' was built under R version 4.5.3
## Warning: package 'xml2' was built under R version 4.5.3
## Semua library berhasil diimport
# ============================================================
# SECTION A — TUGAS 1, 2, 3: Baca file, looping, cek struktur
# ============================================================
library(readr)
library(readxl)
library(jsonlite)
library(xml2)
read_file <- function(filepath) {
ext <- tolower(tools::file_ext(filepath))
if (ext == "csv") {
return(read_csv(filepath, show_col_types = FALSE))
} else if (ext == "xlsx") {
return(read_excel(filepath))
} else if (ext == "json") {
data <- fromJSON(filepath)
return(as.data.frame(data))
} else if (ext == "txt") {
return(read_delim(filepath, delim = "|", show_col_types = FALSE))
} else if (ext == "xml") {
tree <- read_xml(filepath)
records <- xml_find_all(tree, ".//*[not(*)]/..")
rows <- lapply(records, function(record) {
children <- xml_children(record)
setNames(as.list(xml_text(children)), xml_name(children))
})
return(as.data.frame(do.call(rbind, lapply(rows, as.data.frame))))
} else {
stop(paste("Format file tidak dikenal:", ext))
}
}
file_list <- c("ecommerce.csv", "ecommerce.xlsx", "ecommerce.json", "ecommerce.txt", "ecommerce.xml")
reference_columns <- NULL
dataframes <- list()
summary_list <- list()
for (filename in file_list) {
df <- read_file(filename)
dataframes[[filename]] <- df
if (is.null(reference_columns)) {
reference_columns <- colnames(df)
status <- "Referensi Kolom"
} else {
if (identical(colnames(df), reference_columns)) {
status <- "Ready to Merge"
} else {
status <- "Need Adjustment"
}
}
summary_list[[filename]] <- data.frame(
File = filename,
Jumlah_Baris = nrow(df),
Jumlah_Kolom = ncol(df),
Status = status
)
}
summary_df <- do.call(rbind, summary_list)Ringkasan pembacaan 5 file dataset:
library(knitr)
library(kableExtra)
library(dplyr)
kable(summary_df, align = "c") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "bordered"),
full_width = TRUE
) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")| File | Jumlah_Baris | Jumlah_Kolom | Status | |
|---|---|---|---|---|
| ecommerce.csv | ecommerce.csv | 2000 | 22 | Referensi Kolom |
| ecommerce.xlsx | ecommerce.xlsx | 2000 | 22 | Ready to Merge |
| ecommerce.json | ecommerce.json | 2000 | 22 | Ready to Merge |
| ecommerce.txt | ecommerce.txt | 2000 | 22 | Ready to Merge |
| ecommerce.xml | ecommerce.xml | 2000 | 22 | Ready to Merge |
# ============================================================
# SECTION A — TUGAS 4: Gabungkan semua data
# ============================================================
df_combined <- do.call(rbind, dataframes)
rownames(df_combined) <- NULL
# Info hasil gabungan
gabung_info <- data.frame(
Keterangan = c("Total Baris", "Total Kolom"),
Nilai = c(nrow(df_combined), ncol(df_combined)),
stringsAsFactors = FALSE
)Hasil penggabungan dataset:
library(knitr)
library(kableExtra)
library(dplyr)
kable(gabung_info, align = "lc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")| Keterangan | Nilai |
|---|---|
| Total Baris | 10000 |
| Total Kolom | 22 |
Preview 5 baris pertama dataset gabungan:
library(knitr)
library(kableExtra)
library(dplyr)
kable(head(df_combined, 5)) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "bordered"),
full_width = TRUE,
font_size = 12
) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
scroll_box(width = "100%")| order_id | order_date | ship_date | platform | category | product_name | unit_price | quantity | gross_sales | campaign | voucher_code | discount_pct | discount_value | shipping_cost | net_sales | payment_method | customer_segment | region | stock_status | order_status | customer_rating | priority_flag |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ORD00612 | 2024-04-19 | 2024/04/24 | Tokopedia | home living | Table Lamp | 188905 | 4 | 755620 | Flash Sale | DISC10 | 10 | 75562 | 12000 | 680058 | E-Wallet | VIP | Bekasi | Preorder | completed | 5 | Y |
| ORD00112 | 2024/01/24 | 29/01/2024 | TikTok Shop | Electronics | Power Bank | 1476873 | 1 | 1476873 | Normal Day | NONE | 0 | 0 | 18000 | 1476873 | cod | Returning | Makassar | In Stock | completed | 5 | N |
| ORD01186 | 2024-06-12 | 06-19-2024 | Tokopedia | Fashion | Women Dress | 231072 | 2 | 462144 | Mega Campaign | DISC20 | 20 | 92429 | 15000 | 369715 | Virtual Account | Returning | Surabaya | In Stock | delivered | 3 | Yes |
| ORD01511 | 2024/08/07 | 08-14-2024 | Tokopedia | home living | Vacuum Cleaner | 512063 | 3 | 1536189 | Flash Sale | DISC10 | 10 | 153619 | 0 | 1382570 | Transfer Bank | New | Yogyakarta | In Stock | DELIVERED | 4 | No |
| ORD00772 | 2024-12-08 | NA | Tokopedia | Beauty | Body Lotion | 221586 | 2 | 443172 | Payday Sale | DISC15 | 15 | Rp 66.476 | 0 | 376696 | COD | Returning | Surabaya | In Stock | DELIVERED | 5 | normal |
Tujuan: Memahami kondisi dataset hasil penggabungan
# ============================================================
# SECTION B — TUGAS 1: Info dataset
# ============================================================
if (!exists("df_combined") || is.null(df_combined)) {
stop("df_combined belum tersedia. Jalankan proses penggabungan data terlebih dahulu.")
}
dtypes_df <- data.frame(
Kolom = names(df_combined),
Tipe_Data = sapply(df_combined, class),
stringsAsFactors = FALSE
)
info_df <- data.frame(
Keterangan = c("Jumlah Total Baris", "Jumlah Total Kolom"),
Nilai = c(nrow(df_combined), ncol(df_combined)),
stringsAsFactors = FALSE
)Informasi umum dataset:
library(knitr)
library(kableExtra)
library(dplyr)
kable(info_df, align = "lc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")| Keterangan | Nilai |
|---|---|
| Jumlah Total Baris | 10000 |
| Jumlah Total Kolom | 22 |
Tipe data setiap kolom:
library(knitr)
library(kableExtra)
library(dplyr)
kable(dtypes_df, align = "lc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")| Kolom | Tipe_Data | |
|---|---|---|
| order_id | order_id | character |
| order_date | order_date | character |
| ship_date | ship_date | character |
| platform | platform | character |
| category | category | character |
| product_name | product_name | character |
| unit_price | unit_price | character |
| quantity | quantity | character |
| gross_sales | gross_sales | character |
| campaign | campaign | character |
| voucher_code | voucher_code | character |
| discount_pct | discount_pct | character |
| discount_value | discount_value | character |
| shipping_cost | shipping_cost | character |
| net_sales | net_sales | character |
| payment_method | payment_method | character |
| customer_segment | customer_segment | character |
| region | region | character |
| stock_status | stock_status | character |
| order_status | order_status | character |
| customer_rating | customer_rating | character |
| priority_flag | priority_flag | character |
# ============================================================
# SECTION B — TUGAS 2: Missing Values & Duplicates
# ============================================================
missing <- colSums(is.na(df_combined))
missing_pct <- round((missing / nrow(df_combined)) * 100, 2)
missing_df <- data.frame(
Kolom = names(missing),
Jumlah_Missing = as.numeric(missing),
Persentase = as.numeric(missing_pct),
stringsAsFactors = FALSE
)
# Ambil hanya yang ada missing
missing_df <- missing_df[missing_df$Jumlah_Missing > 0, ]
# Duplicate rows
dup_df <- data.frame(
Keterangan = "Jumlah Baris Duplikat",
Nilai = sum(duplicated(df_combined)),
stringsAsFactors = FALSE
)Missing values per kolom:
library(knitr)
library(kableExtra)
library(dplyr)
kable(missing_df, align = "lcc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#e74c3c", color = "white")| Kolom | Jumlah_Missing | Persentase | |
|---|---|---|---|
| 3 | ship_date | 651 | 6.51 |
| 11 | voucher_code | 196 | 1.96 |
| 12 | discount_pct | 276 | 2.76 |
| 16 | payment_method | 140 | 1.40 |
| 21 | customer_rating | 1614 | 16.14 |
| 22 | priority_flag | 752 | 7.52 |
Duplikasi data:
library(knitr)
library(kableExtra)
library(dplyr)
kable(dup_df, align = "lc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")| Keterangan | Nilai |
|---|---|
| Jumlah Baris Duplikat | 5581 |
# ============================================================
# SECTION B — TUGAS 3: Masalah Kualitas Data
# ============================================================
kualitas_df <- data.frame(
No = 1:5,
Masalah = c(
"Inkonsistensi Penulisan Platform",
"Inkonsistensi Penulisan Order Status",
"Missing Values di Kolom Penting",
"Tipe Data Tidak Konsisten",
"Duplikasi Data"
),
Keterangan = c(
'"shopee", "SHOPEE", " shopee ", "Shopee" -> entitas sama',
'"delivered", "DELIVERED", "Batal", "CANCEL" -> perlu satu standar',
"customer_rating, payment_method, ship_date banyak kosong",
"net_sales bertipe object; format tanggal tidak seragam",
"Data dari 5 file berbeda berpotensi mengandung baris yang sama"
),
stringsAsFactors = FALSE
)
library(knitr)
library(kableExtra)
library(dplyr)
kable(kualitas_df, align = "cll") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#f39c12", color = "white")| No | Masalah | Keterangan |
|---|---|---|
| 1 | Inkonsistensi Penulisan Platform | “shopee”, “SHOPEE”, ” shopee “,”Shopee” -> entitas sama |
| 2 | Inkonsistensi Penulisan Order Status | “delivered”, “DELIVERED”, “Batal”, “CANCEL” -> perlu satu standar |
| 3 | Missing Values di Kolom Penting | customer_rating, payment_method, ship_date banyak kosong |
| 4 | Tipe Data Tidak Konsisten | net_sales bertipe object; format tanggal tidak seragam |
| 5 | Duplikasi Data | Data dari 5 file berbeda berpotensi mengandung baris yang sama |
Daftar masalah kualitas data:
library(knitr)
library(kableExtra)
library(dplyr)
kable(kualitas_df, align = "clll") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "bordered"),
full_width = TRUE
) %>%
row_spec(0, bold = TRUE, background = "#e67e22", color = "white")| No | Masalah | Keterangan |
|---|---|---|
| 1 | Inkonsistensi Penulisan Platform | “shopee”, “SHOPEE”, ” shopee “,”Shopee” -> entitas sama |
| 2 | Inkonsistensi Penulisan Order Status | “delivered”, “DELIVERED”, “Batal”, “CANCEL” -> perlu satu standar |
| 3 | Missing Values di Kolom Penting | customer_rating, payment_method, ship_date banyak kosong |
| 4 | Tipe Data Tidak Konsisten | net_sales bertipe object; format tanggal tidak seragam |
| 5 | Duplikasi Data | Data dari 5 file berbeda berpotensi mengandung baris yang sama |
Tujuan: Membersihkan data menggunakan logika programming
# ============================================================
# SECTION C — 1. Standardisasi Platform
# ============================================================
library(dplyr)
# Sebelum cleaning
platform_before <- df_clean %>%
count(platform, name = "Jumlah_Sebelum") %>%
rename(Platform = platform)
# Fungsi standardisasi
standardize_platform <- function(val) {
if (is.na(val)) return("Unknown")
val_clean <- tolower(trimws(val))
if (grepl("shopee", val_clean)) {
return("Shopee")
} else if (grepl("tokopedia|tokped", val_clean)) {
return("Tokopedia")
} else if (grepl("tiktok", val_clean)) {
return("TikTok Shop")
} else if (grepl("lazada", val_clean)) {
return("Lazada")
} else if (grepl("blibli", val_clean)) {
return("Blibli")
} else {
return(tools::toTitleCase(trimws(val)))
}
}
# Terapkan ke data
df_clean$platform <- sapply(df_clean$platform, standardize_platform)
# Setelah cleaning
platform_after <- df_clean %>%
count(platform, name = "Jumlah_Setelah") %>%
rename(Platform = platform)Platform — Sebelum standardisasi:
library(knitr)
library(kableExtra)
library(dplyr)
kable(platform_before, align = "lc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#8e44ad", color = "white")| Platform | Jumlah_Sebelum |
|---|---|
| blibli | 33 |
| lazada | 36 |
| shopee | 39 |
| tiktok shop | 51 |
| tokopedia | 18 |
| BLIBLI | 50 |
| Blibli | 1820 |
| LAZADA | 50 |
| Lazada | 1760 |
| SHOPEE | 75 |
| Shopee | 1895 |
| TIKTOK SHOP | 30 |
| TOKOPEDIA | 95 |
| TikTok Shop | 1840 |
| Tiktok Shop | 35 |
| Tokopedia | 1765 |
| blibli | 117 |
| lazada | 94 |
| shopee | 71 |
| tiktok shop | 74 |
| tokopedia | 52 |
Platform — Setelah standardisasi:
library(knitr)
library(kableExtra)
library(dplyr)
kable(platform_after, align = "lc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#27ae60", color = "white")| Platform | Jumlah_Setelah |
|---|---|
| Blibli | 2020 |
| Lazada | 1940 |
| Shopee | 2080 |
| TikTok Shop | 2030 |
| Tokopedia | 1930 |
# ============================================================
# SECTION C — 2. Cleaning net_sales
# ============================================================
clean_sales <- function(val) {
if (is.na(val)) return(0)
val_str <- trimws(as.character(val))
# Hilangkan "Rp" dan pemisah ribuan
val_str <- gsub("Rp", "", val_str)
val_str <- gsub("\\.", "", val_str)
val_str <- gsub(",", "", val_str)
# Ambil hanya angka dan tanda minus
val_str <- gsub("[^0-9-]", "", val_str)
# Konversi ke numeric
val_num <- suppressWarnings(as.numeric(val_str))
if (is.na(val_num) || val_num < 0) return(0)
return(val_num)
}
# Terapkan ke kolom
df_clean$net_sales <- sapply(df_clean$net_sales, clean_sales)
# Statistik deskriptif
desc <- data.frame(
Statistik = names(summary(df_clean$net_sales)),
Nilai = as.numeric(summary(df_clean$net_sales))
)
desc$Nilai <- round(desc$Nilai, 2)
# Info tambahan
netsales_info <- data.frame(
Keterangan = c("Tipe Data Setelah Cleaning", "Jumlah Nilai Negatif"),
Nilai = c(class(df_clean$net_sales), sum(df_clean$net_sales < 0)),
stringsAsFactors = FALSE
)Info kolom net_sales setelah cleaning:
library(knitr)
library(kableExtra)
library(dplyr)
kable(netsales_info, align = "lc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")| Keterangan | Nilai |
|---|---|
| Tipe Data Setelah Cleaning | numeric |
| Jumlah Nilai Negatif | 0 |
Statistik deskriptif net_sales:
library(knitr)
library(kableExtra)
library(dplyr)
kable(desc, align = "lc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")| Statistik | Nilai |
|---|---|
| Min. | 0 |
| 1st Qu. | 247700 |
| Median | 708293 |
| Mean | 1377342 |
| 3rd Qu. | 1653688 |
| Max. | 18080224 |
# ============================================================
# SECTION C — 3. Handling Missing Values
# ============================================================
library(dplyr)
# Cleaning payment_method
df_clean$payment_method <- sapply(df_clean$payment_method, function(x) {
if (is.na(x) || trimws(x) == "") {
return("Unknown")
} else {
return(trimws(as.character(x)))
}
})
# Cleaning customer_rating
df_clean$customer_rating <- as.numeric(df_clean$customer_rating)
median_rating <- median(df_clean$customer_rating, na.rm = TRUE)
df_clean$customer_rating[is.na(df_clean$customer_rating)] <- median_rating
# Ringkasan hasil
mv_result <- data.frame(
Kolom = c("payment_method", "customer_rating"),
Missing_Setelah = c(
sum(is.na(df_clean$payment_method)),
sum(is.na(df_clean$customer_rating))
),
Metode_Imputasi = c(
'Diisi "Unknown"',
paste("Diisi median =", round(median_rating, 2))
),
stringsAsFactors = FALSE
)Hasil handling missing values:
library(knitr)
library(kableExtra)
library(dplyr)
kable(mv_result, align = "lcc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#27ae60", color = "white")| Kolom | Missing_Setelah | Metode_Imputasi |
|---|---|---|
| payment_method | 0 | Diisi “Unknown” |
| customer_rating | 0 | Diisi median = 5 |
# ============================================================
# SECTION C — 4. Standardisasi order_status
# ============================================================
library(dplyr)
# Sebelum cleaning
status_before <- df_clean %>%
count(order_status, name = "Jumlah_Sebelum") %>%
rename(`Order Status` = order_status)
# Fungsi standardisasi
standardize_status <- function(val) {
if (is.na(val)) return("Unknown")
val_clean <- tolower(trimws(val))
if (val_clean %in% c("delivered", "completed", "complete")) {
return("Completed")
} else if (val_clean %in% c("cancelled", "cancel", "batal")) {
return("Cancelled")
} else if (val_clean %in% c("shipped", "on delivery", "on_delivery")) {
return("Shipped")
} else if (val_clean %in% c("returned", "retur", "return")) {
return("Returned")
} else {
return(tools::toTitleCase(trimws(val)))
}
}
# Terapkan ke data
df_clean$order_status <- sapply(df_clean$order_status, standardize_status)
# Setelah cleaning
status_after <- df_clean %>%
count(order_status, name = "Jumlah_Setelah") %>%
rename(`Order Status` = order_status)Order Status — Sebelum standardisasi:
library(knitr)
library(kableExtra)
library(dplyr)
kable(status_before, align = "lc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#8e44ad", color = "white")| Order Status | Jumlah_Sebelum |
|---|---|
| batal | 18 |
| cancelled | 6 |
| completed | 42 |
| delivered | 75 |
| on delivery | 6 |
| retur | 6 |
| returned | 9 |
| shipped | 3 |
| BATAL | 5 |
| Batal | 130 |
| CANCEL | 180 |
| CANCELLED | 5 |
| COMPLETED | 55 |
| Cancelled | 230 |
| DELIVERED | 1980 |
| Delivered | 1940 |
| ON DELIVERY | 10 |
| On Delivery | 220 |
| RETUR | 205 |
| Returned | 165 |
| SHIPPED | 5 |
| Shipped | 265 |
| batal | 12 |
| cancelled | 174 |
| completed | 1903 |
| delivered | 1905 |
| on delivery | 4 |
| retur | 4 |
| returned | 201 |
| shipped | 237 |
Order Status — Setelah standardisasi:
library(knitr)
library(kableExtra)
library(dplyr)
kable(status_after, align = "lc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#27ae60", color = "white")| Order Status | Jumlah_Setelah |
|---|---|
| Cancelled | 760 |
| Completed | 7900 |
| Returned | 590 |
| Shipped | 750 |
# ============================================================
# SECTION C — 5. Looping: Cleaning 3 kolom teks + hapus duplikat
# ============================================================
library(dplyr)
text_columns <- c("category", "product_name", "region")
loop_summary <- data.frame()
for (col in text_columns) {
sebelum <- length(unique(df_clean[[col]]))
df_clean[[col]] <- sapply(df_clean[[col]], function(x) {
if (!is.na(x)) {
return(tools::toTitleCase(trimws(as.character(x))))
} else {
return(x)
}
})
sesudah <- length(unique(df_clean[[col]]))
loop_summary <- rbind(loop_summary, data.frame(
Kolom = col,
Unique_Sebelum = sebelum,
Unique_Setelah = sesudah,
stringsAsFactors = FALSE
))
}
# Hapus duplikat
n_before <- nrow(df_clean)
df_clean <- df_clean[!duplicated(df_clean), ]
n_removed <- n_before - nrow(df_clean)
# Dataframe hasil
loop_df <- loop_summary
dup_removed_df <- data.frame(
Keterangan = c("Baris Sebelum", "Duplikat Dihapus", "Baris Setelah"),
Nilai = c(n_before, n_removed, nrow(df_clean)),
stringsAsFactors = FALSE
)Hasil cleaning kolom teks (looping):
library(knitr)
library(kableExtra)
library(dplyr)
kable(loop_df, align = "lcc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")| Kolom | Unique_Sebelum | Unique_Setelah |
|---|---|---|
| category | 31 | 13 |
| product_name | 49 | 25 |
| region | 8 | 8 |
Hasil penghapusan duplikat:
library(knitr)
library(kableExtra)
library(dplyr)
kable(dup_removed_df, align = "lc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")| Keterangan | Nilai |
|---|---|
| Baris Sebelum | 10000 |
| Duplikat Dihapus | 7525 |
| Baris Setelah | 2475 |
Tujuan: Menerapkan logika bisnis menggunakan if / if-else
# ============================================================
# SECTION D — 1. Kolom is_high_value
# ============================================================
library(dplyr)
df_clean$net_sales <- as.numeric(df_clean$net_sales)
df_clean$net_sales[is.na(df_clean$net_sales)] <- 0
df_clean$is_high_value <- ifelse(df_clean$net_sales > 1000000, "Yes", "No")
hv_df <- df_clean %>%
count(is_high_value, name = "Jumlah")## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot_ly(
hv_df,
labels = ~is_high_value,
values = ~Jumlah,
type = "pie",
hole = 0.6,
textinfo = "label+percent",
showlegend = FALSE,
marker = list(
colors = c("#d9ecff", "#001f7a")
)
) %>%
layout(
title = "Distribusi High Value Transaction"
)Distribusi kolom is_high_value:
library(knitr)
library(kableExtra)
library(dplyr)
kable(hv_df, align = "lc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")| is_high_value | Jumlah |
|---|---|
| No | 1575 |
| Yes | 900 |
# ============================================================
# SECTION D — 2. Kolom order_priority (nested IF)
# ============================================================
library(dplyr)
# Buat kategori order_priority
df_clean$order_priority <- ifelse(
df_clean$net_sales > 1000000, "High",
ifelse(df_clean$net_sales >= 500000, "Medium", "Low")
)
# Ringkasan jumlah
op_df <- df_clean %>%
count(order_priority, name = "Jumlah")library(plotly)
plot_ly(
op_df,
labels = ~order_priority,
values = ~Jumlah,
type = "pie",
hole = 0.6,
textinfo = "label+percent",
showlegend = FALSE,
marker = list(
colors = c(
"#d9ecff",
"#5fa8ff",
"#001f7a"
)
)
) %>%
layout(
title = "Distribusi Order Priority"
)Distribusi kolom order_priority:
library(knitr)
library(kableExtra)
library(dplyr)
kable(op_df, align = "lc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")| order_priority | Jumlah |
|---|---|
| High | 900 |
| Low | 1095 |
| Medium | 480 |
# ============================================================
# SECTION D — 3. Kolom valid_transaction
# ============================================================
library(dplyr)
# Validasi transaksi
df_clean$valid_transaction <- ifelse(
df_clean$order_status == "Cancelled",
"Invalid",
"Valid"
)
# Ringkasan
vt_df <- df_clean %>%
count(valid_transaction, name = "Jumlah")
# Preview data (5 baris pertama)
preview_df <- df_clean %>%
select(order_id, platform, net_sales, order_status,
is_high_value, order_priority, valid_transaction) %>%
head(5)library(plotly)
plot_ly(
vt_df,
labels = ~valid_transaction,
values = ~Jumlah,
type = "pie",
hole = 0.6,
textinfo = "label+percent",
showlegend = FALSE,
marker = list(
colors = c(
"#a8d1ff",
"#001f7a"
)
)
) %>%
layout(
title = "Distribusi Valid Transaction"
)Distribusi kolom valid_transaction:
library(knitr)
library(kableExtra)
library(dplyr)
kable(vt_df, align = "lc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")| valid_transaction | Jumlah |
|---|---|
| Invalid | 323 |
| Valid | 2152 |
Preview dataset final (5 baris pertama):
library(knitr)
library(kableExtra)
library(dplyr)
kable(preview_df) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "bordered"),
full_width = TRUE
) %>%
row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
scroll_box(width = "100%")| order_id | platform | net_sales | order_status | is_high_value | order_priority | valid_transaction |
|---|---|---|---|---|---|---|
| ORD00612 | Tokopedia | 680058 | Completed | No | Medium | Valid |
| ORD00112 | TikTok Shop | 1476873 | Completed | Yes | High | Valid |
| ORD01186 | Tokopedia | 369715 | Completed | No | Low | Valid |
| ORD01511 | Tokopedia | 1382570 | Completed | Yes | High | Valid |
| ORD00772 | Tokopedia | 376696 | Completed | No | Low | Valid |
# Simpan ke CSV
write.csv(df_clean, "ecommerce_cleaned.csv", row.names = FALSE)
# Info file hasil
save_info <- data.frame(
Keterangan = c("File Output", "Total Baris", "Total Kolom"),
Nilai = c("ecommerce_cleaned.csv", nrow(df_clean), ncol(df_clean)),
stringsAsFactors = FALSE
)library(knitr)
library(kableExtra)
library(dplyr)
kable(save_info, align = "lc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#27ae60", color = "white")| Keterangan | Nilai |
|---|---|
| File Output | ecommerce_cleaned.csv |
| Total Baris | 2475 |
| Total Kolom | 25 |
Tujuan: Menarik insight sederhana dari data
# ============================================================
# SECTION E — Analytical Thinking
# ============================================================
library(dplyr)
# Platform
platform_vc <- df_clean %>%
count(platform, name = "Jumlah_Transaksi") %>%
rename(Platform = platform)
# Category
category_vc <- df_clean %>%
count(category, name = "Jumlah_Transaksi") %>%
rename(Category = category)
# Order Status
status_vc <- df_clean %>%
count(order_status, name = "Jumlah_Transaksi") %>%
rename(`Order Status` = order_status)1. Distribusi transaksi per platform:
library(knitr)
library(kableExtra)
library(dplyr)
kable(platform_vc, align = "lc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#2980b9", color = "white") %>%
row_spec(1, bold = TRUE, background = "#d5e8f5")| Platform | Jumlah_Transaksi |
|---|---|
| Blibli | 514 |
| Lazada | 486 |
| Shopee | 519 |
| TikTok Shop | 481 |
| Tokopedia | 475 |
library(plotly)
library(dplyr)
platform_vc <- df_clean %>%
count(platform, name = "Jumlah_Transaksi") %>%
arrange(desc(Jumlah_Transaksi))
platform_vc$platform <- factor(
platform_vc$platform,
levels = platform_vc$platform
)
plot_ly(
data = platform_vc,
x = ~platform,
y = ~Jumlah_Transaksi,
type = "bar",
text = ~Jumlah_Transaksi,
textposition = "outside",
marker = list(
color = c(
"#d9ecff",
"#a8d1ff",
"#5fa8ff",
"#1f5fd6",
"#001f7a"
)
)
) %>%
layout(
title = "1. Distribusi Transaksi per Platform",
xaxis = list(title = "Platform"),
yaxis = list(title = "Jumlah Transaksi"),
plot_bgcolor = "#f8f9fa"
)2. Distribusi transaksi per kategori:
library(knitr)
library(kableExtra)
library(dplyr)
kable(category_vc, align = "lc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#2980b9", color = "white") %>%
row_spec(1, bold = TRUE, background = "#d5e8f5")| Category | Jumlah_Transaksi |
|---|---|
| BEAUTY | 10 |
| Beauty | 475 |
| ELECTRONICS | 15 |
| Electronics | 439 |
| FASHION | 36 |
| Fashion | 474 |
| HOME LIVING | 13 |
| HOME_LIVING | 2 |
| Home Living | 465 |
| Home_Living | 33 |
| Home_living | 2 |
| SPORTS | 12 |
| Sports | 499 |
library(plotly)
library(dplyr)
category_vc <- df_clean %>%
count(category, name = "Jumlah_Transaksi") %>%
arrange(desc(Jumlah_Transaksi))
category_vc$category <- factor(
category_vc$category,
levels = category_vc$category
)
plot_ly(
data = category_vc,
x = ~category,
y = ~Jumlah_Transaksi,
type = "bar",
text = ~Jumlah_Transaksi,
textposition = "outside",
marker = list(
color = c(
"#d9ecff",
"#b8dcff",
"#8cc8ff",
"#5fa8ff",
"#3b82f6",
"#1f5fd6",
"#001f7a"
)
)
) %>%
layout(
title = "2. Distribusi Transaksi per Category",
xaxis = list(title = "Category"),
yaxis = list(title = "Jumlah Transaksi"),
plot_bgcolor = "#f8f9fa"
)3. Distribusi transaksi per status order:
library(knitr)
library(kableExtra)
library(dplyr)
kable(status_vc, align = "lc") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#2980b9", color = "white") %>%
row_spec(1, bold = TRUE, background = "#d5e8f5")| Order Status | Jumlah_Transaksi |
|---|---|
| Cancelled | 323 |
| Completed | 1834 |
| Returned | 144 |
| Shipped | 174 |
library(plotly)
library(dplyr)
status_vc <- df_clean %>%
count(order_status, name = "Jumlah_Transaksi") %>%
arrange(desc(Jumlah_Transaksi))
status_vc$order_status <- factor(
status_vc$order_status,
levels = status_vc$order_status
)
plot_ly(
data = status_vc,
x = ~order_status,
y = ~Jumlah_Transaksi,
type = "bar",
text = ~Jumlah_Transaksi,
textposition = "outside",
marker = list(
color = c(
"#d9ecff",
"#8cc8ff",
"#3b82f6",
"#1f5fd6",
"#001f7a"
)
)
) %>%
layout(
title = "3. Distribusi Transaksi per Order Status",
xaxis = list(title = "Order Status"),
yaxis = list(title = "Jumlah Transaksi"),
plot_bgcolor = "#f8f9fa"
)