Data Science Programming
Dosen: Mr. Bakti Siregar, M.Sc., CDS
R Programming
E-Commerce
Web Scraping
Data Science
Python
Kampus: Institut Teknologi Sains Bandung (ITSB)
Program Studi: Sains Data

Putri Adria Garini

NIM: 52250002

Hirose Kawarin Sirait

NIM: 52250012

Cecilia Mutiara Handayani

NIM: 52250013
read_file <- function(filepath) {
  ext <- tolower(tools::file_ext(filepath))
  
  if (ext == "csv") {
    return(readr::read_csv(filepath, show_col_types = FALSE))
    
  } else if (ext == "xlsx") {
    return(readxl::read_excel(filepath))
    
  } else if (ext == "json") {
    data <- jsonlite::fromJSON(filepath)
    return(as.data.frame(data))
    
  } else if (ext == "txt") {
    return(readr::read_delim(filepath, delim = "|", show_col_types = FALSE))
    
  } else if (ext == "xml") {
    tree <- xml2::read_xml(filepath)
    records <- xml2::xml_find_all(tree, ".//*[not(*)]/..")
    
    rows <- lapply(records, function(record) {
      children <- xml2::xml_children(record)
      setNames(as.list(xml2::xml_text(children)), xml2::xml_name(children))
    })
    
    return(as.data.frame(do.call(rbind, lapply(rows, as.data.frame))))
    
  } else {
    stop(paste("Format file tidak dikenal:", ext))
  }
}

file_list <- c("ecommerce.csv", "ecommerce.xlsx", "ecommerce.json", "ecommerce.txt", "ecommerce.xml")

dataframes <- list()

for (filename in file_list) {
  df <- read_file(filename)
  dataframes[[filename]] <- df
}

df_combined <- do.call(rbind, dataframes)
rownames(df_combined) <- NULL

SECTION A — DATA COLLECTION USING PROGRAMMING

Tujuan: Mengambil dan menggabungkan data dari berbagai sumber menggunakan Python

library(readr)
## Warning: package 'readr' was built under R version 4.5.3
## 
## Attaching package: 'readr'
## The following object is masked _by_ '.GlobalEnv':
## 
##     read_file
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.3
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 4.5.3
library(xml2)
## Warning: package 'xml2' was built under R version 4.5.3
cat("Semua library berhasil diimport")
## Semua library berhasil diimport
# ============================================================
# SECTION A — TUGAS 1, 2, 3: Baca file, looping, cek struktur
# ============================================================
library(readr)
library(readxl)
library(jsonlite)
library(xml2)

read_file <- function(filepath) {
  ext <- tolower(tools::file_ext(filepath))
  if (ext == "csv") {
    return(read_csv(filepath, show_col_types = FALSE))
  } else if (ext == "xlsx") {
    return(read_excel(filepath))
  } else if (ext == "json") {
    data <- fromJSON(filepath)
    return(as.data.frame(data))
  } else if (ext == "txt") {
    return(read_delim(filepath, delim = "|", show_col_types = FALSE))
  } else if (ext == "xml") {
    tree <- read_xml(filepath)
    records <- xml_find_all(tree, ".//*[not(*)]/..")
    rows <- lapply(records, function(record) {
      children <- xml_children(record)
      setNames(as.list(xml_text(children)), xml_name(children))
    })
    return(as.data.frame(do.call(rbind, lapply(rows, as.data.frame))))
  } else {
    stop(paste("Format file tidak dikenal:", ext))
  }
}

file_list <- c("ecommerce.csv", "ecommerce.xlsx", "ecommerce.json", "ecommerce.txt", "ecommerce.xml")
reference_columns <- NULL
dataframes <- list()
summary_list <- list()

for (filename in file_list) {
  df <- read_file(filename)
  dataframes[[filename]] <- df
  if (is.null(reference_columns)) {
    reference_columns <- colnames(df)
    status <- "Referensi Kolom"
  } else {
    if (identical(colnames(df), reference_columns)) {
      status <- "Ready to Merge"
    } else {
      status <- "Need Adjustment"
    }
  }
  summary_list[[filename]] <- data.frame(
    File = filename,
    Jumlah_Baris = nrow(df),
    Jumlah_Kolom = ncol(df),
    Status = status
  )
}

summary_df <- do.call(rbind, summary_list)

Ringkasan pembacaan 5 file dataset:

library(knitr)
library(kableExtra)
library(dplyr)

kable(summary_df, align = "c") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "bordered"),
    full_width = TRUE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
File Jumlah_Baris Jumlah_Kolom Status
ecommerce.csv ecommerce.csv 2000 22 Referensi Kolom
ecommerce.xlsx ecommerce.xlsx 2000 22 Ready to Merge
ecommerce.json ecommerce.json 2000 22 Ready to Merge
ecommerce.txt ecommerce.txt 2000 22 Ready to Merge
ecommerce.xml ecommerce.xml 2000 22 Ready to Merge
# ============================================================
# SECTION A — TUGAS 4: Gabungkan semua data
# ============================================================
df_combined <- do.call(rbind, dataframes)
rownames(df_combined) <- NULL

# Info hasil gabungan
gabung_info <- data.frame(
  Keterangan = c("Total Baris", "Total Kolom"),
  Nilai = c(nrow(df_combined), ncol(df_combined)),
  stringsAsFactors = FALSE
)

Hasil penggabungan dataset:

library(knitr)
library(kableExtra)
library(dplyr)

kable(gabung_info, align = "lc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Keterangan Nilai
Total Baris 10000
Total Kolom 22

Preview 5 baris pertama dataset gabungan:

library(knitr)
library(kableExtra)
library(dplyr)

kable(head(df_combined, 5)) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "bordered"),
    full_width = TRUE,
    font_size = 12
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  scroll_box(width = "100%")
order_id order_date ship_date platform category product_name unit_price quantity gross_sales campaign voucher_code discount_pct discount_value shipping_cost net_sales payment_method customer_segment region stock_status order_status customer_rating priority_flag
ORD00612 2024-04-19 2024/04/24 Tokopedia home living Table Lamp 188905 4 755620 Flash Sale DISC10 10 75562 12000 680058 E-Wallet VIP Bekasi Preorder completed 5 Y
ORD00112 2024/01/24 29/01/2024 TikTok Shop Electronics Power Bank 1476873 1 1476873 Normal Day NONE 0 0 18000 1476873 cod Returning Makassar In Stock completed 5 N
ORD01186 2024-06-12 06-19-2024 Tokopedia Fashion Women Dress 231072 2 462144 Mega Campaign DISC20 20 92429 15000 369715 Virtual Account Returning Surabaya In Stock delivered 3 Yes
ORD01511 2024/08/07 08-14-2024 Tokopedia home living Vacuum Cleaner 512063 3 1536189 Flash Sale DISC10 10 153619 0 1382570 Transfer Bank New Yogyakarta In Stock DELIVERED 4 No
ORD00772 2024-12-08 NA Tokopedia Beauty Body Lotion 221586 2 443172 Payday Sale DISC15 15 Rp 66.476 0 376696 COD Returning Surabaya In Stock DELIVERED 5 normal

SECTION B — DATA HANDLING

Tujuan: Memahami kondisi dataset hasil penggabungan

# ============================================================
# SECTION B — TUGAS 1: Info dataset
# ============================================================
if (!exists("df_combined") || is.null(df_combined)) {
  stop("df_combined belum tersedia. Jalankan proses penggabungan data terlebih dahulu.")
}

dtypes_df <- data.frame(
  Kolom = names(df_combined),
  Tipe_Data = sapply(df_combined, class),
  stringsAsFactors = FALSE
)

info_df <- data.frame(
  Keterangan = c("Jumlah Total Baris", "Jumlah Total Kolom"),
  Nilai = c(nrow(df_combined), ncol(df_combined)),
  stringsAsFactors = FALSE
)

Informasi umum dataset:

library(knitr)
library(kableExtra)
library(dplyr)

kable(info_df, align = "lc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Keterangan Nilai
Jumlah Total Baris 10000
Jumlah Total Kolom 22

Tipe data setiap kolom:

library(knitr)
library(kableExtra)
library(dplyr)

kable(dtypes_df, align = "lc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Kolom Tipe_Data
order_id order_id character
order_date order_date character
ship_date ship_date character
platform platform character
category category character
product_name product_name character
unit_price unit_price character
quantity quantity character
gross_sales gross_sales character
campaign campaign character
voucher_code voucher_code character
discount_pct discount_pct character
discount_value discount_value character
shipping_cost shipping_cost character
net_sales net_sales character
payment_method payment_method character
customer_segment customer_segment character
region region character
stock_status stock_status character
order_status order_status character
customer_rating customer_rating character
priority_flag priority_flag character
# ============================================================
# SECTION B — TUGAS 2: Missing Values & Duplicates
# ============================================================
missing <- colSums(is.na(df_combined))
missing_pct <- round((missing / nrow(df_combined)) * 100, 2)

missing_df <- data.frame(
  Kolom = names(missing),
  Jumlah_Missing = as.numeric(missing),
  Persentase = as.numeric(missing_pct),
  stringsAsFactors = FALSE
)

# Ambil hanya yang ada missing
missing_df <- missing_df[missing_df$Jumlah_Missing > 0, ]

# Duplicate rows
dup_df <- data.frame(
  Keterangan = "Jumlah Baris Duplikat",
  Nilai = sum(duplicated(df_combined)),
  stringsAsFactors = FALSE
)

Missing values per kolom:

library(knitr)
library(kableExtra)
library(dplyr)

kable(missing_df, align = "lcc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#e74c3c", color = "white")
Kolom Jumlah_Missing Persentase
3 ship_date 651 6.51
11 voucher_code 196 1.96
12 discount_pct 276 2.76
16 payment_method 140 1.40
21 customer_rating 1614 16.14
22 priority_flag 752 7.52

Duplikasi data:

library(knitr)
library(kableExtra)
library(dplyr)

kable(dup_df, align = "lc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Keterangan Nilai
Jumlah Baris Duplikat 5581
# ============================================================
# SECTION B — TUGAS 3: Masalah Kualitas Data
# ============================================================
kualitas_df <- data.frame(
  No = 1:5,
  Masalah = c(
    "Inkonsistensi Penulisan Platform",
    "Inkonsistensi Penulisan Order Status",
    "Missing Values di Kolom Penting",
    "Tipe Data Tidak Konsisten",
    "Duplikasi Data"
  ),
  Keterangan = c(
    '"shopee", "SHOPEE", " shopee ", "Shopee" -> entitas sama',
    '"delivered", "DELIVERED", "Batal", "CANCEL" -> perlu satu standar',
    "customer_rating, payment_method, ship_date banyak kosong",
    "net_sales bertipe object; format tanggal tidak seragam",
    "Data dari 5 file berbeda berpotensi mengandung baris yang sama"
  ),
  stringsAsFactors = FALSE
)
library(knitr)
library(kableExtra)
library(dplyr)

kable(kualitas_df, align = "cll") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#f39c12", color = "white")
No Masalah Keterangan
1 Inkonsistensi Penulisan Platform “shopee”, “SHOPEE”, ” shopee “,”Shopee” -> entitas sama
2 Inkonsistensi Penulisan Order Status “delivered”, “DELIVERED”, “Batal”, “CANCEL” -> perlu satu standar
3 Missing Values di Kolom Penting customer_rating, payment_method, ship_date banyak kosong
4 Tipe Data Tidak Konsisten net_sales bertipe object; format tanggal tidak seragam
5 Duplikasi Data Data dari 5 file berbeda berpotensi mengandung baris yang sama

Daftar masalah kualitas data:

library(knitr)
library(kableExtra)
library(dplyr)

kable(kualitas_df, align = "clll") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "bordered"),
    full_width = TRUE
  ) %>%
  row_spec(0, bold = TRUE, background = "#e67e22", color = "white")
No Masalah Keterangan
1 Inkonsistensi Penulisan Platform “shopee”, “SHOPEE”, ” shopee “,”Shopee” -> entitas sama
2 Inkonsistensi Penulisan Order Status “delivered”, “DELIVERED”, “Batal”, “CANCEL” -> perlu satu standar
3 Missing Values di Kolom Penting customer_rating, payment_method, ship_date banyak kosong
4 Tipe Data Tidak Konsisten net_sales bertipe object; format tanggal tidak seragam
5 Duplikasi Data Data dari 5 file berbeda berpotensi mengandung baris yang sama

SECTION C — DATA CLEANING

Tujuan: Membersihkan data menggunakan logika programming

df_clean <- df_combined
# ============================================================
# SECTION C — 1. Standardisasi Platform
# ============================================================
library(dplyr)

# Sebelum cleaning
platform_before <- df_clean %>%
  count(platform, name = "Jumlah_Sebelum") %>%
  rename(Platform = platform)

# Fungsi standardisasi
standardize_platform <- function(val) {
  if (is.na(val)) return("Unknown")
  
  val_clean <- tolower(trimws(val))
  
  if (grepl("shopee", val_clean)) {
    return("Shopee")
  } else if (grepl("tokopedia|tokped", val_clean)) {
    return("Tokopedia")
  } else if (grepl("tiktok", val_clean)) {
    return("TikTok Shop")
  } else if (grepl("lazada", val_clean)) {
    return("Lazada")
  } else if (grepl("blibli", val_clean)) {
    return("Blibli")
  } else {
    return(tools::toTitleCase(trimws(val)))
  }
}

# Terapkan ke data
df_clean$platform <- sapply(df_clean$platform, standardize_platform)

# Setelah cleaning
platform_after <- df_clean %>%
  count(platform, name = "Jumlah_Setelah") %>%
  rename(Platform = platform)

Platform — Sebelum standardisasi:

library(knitr)
library(kableExtra)
library(dplyr)

kable(platform_before, align = "lc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#8e44ad", color = "white")
Platform Jumlah_Sebelum
blibli 33
lazada 36
shopee 39
tiktok shop 51
tokopedia 18
BLIBLI 50
Blibli 1820
LAZADA 50
Lazada 1760
SHOPEE 75
Shopee 1895
TIKTOK SHOP 30
TOKOPEDIA 95
TikTok Shop 1840
Tiktok Shop 35
Tokopedia 1765
blibli 117
lazada 94
shopee 71
tiktok shop 74
tokopedia 52

Platform — Setelah standardisasi:

library(knitr)
library(kableExtra)
library(dplyr)

kable(platform_after, align = "lc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#27ae60", color = "white")
Platform Jumlah_Setelah
Blibli 2020
Lazada 1940
Shopee 2080
TikTok Shop 2030
Tokopedia 1930
# ============================================================
# SECTION C — 2. Cleaning net_sales
# ============================================================
clean_sales <- function(val) {
  if (is.na(val)) return(0)
  
  val_str <- trimws(as.character(val))
  
  # Hilangkan "Rp" dan pemisah ribuan
  val_str <- gsub("Rp", "", val_str)
  val_str <- gsub("\\.", "", val_str)
  val_str <- gsub(",", "", val_str)
  
  # Ambil hanya angka dan tanda minus
  val_str <- gsub("[^0-9-]", "", val_str)
  
  # Konversi ke numeric
  val_num <- suppressWarnings(as.numeric(val_str))
  
  if (is.na(val_num) || val_num < 0) return(0)
  
  return(val_num)
}

# Terapkan ke kolom
df_clean$net_sales <- sapply(df_clean$net_sales, clean_sales)

# Statistik deskriptif
desc <- data.frame(
  Statistik = names(summary(df_clean$net_sales)),
  Nilai = as.numeric(summary(df_clean$net_sales))
)

desc$Nilai <- round(desc$Nilai, 2)

# Info tambahan
netsales_info <- data.frame(
  Keterangan = c("Tipe Data Setelah Cleaning", "Jumlah Nilai Negatif"),
  Nilai = c(class(df_clean$net_sales), sum(df_clean$net_sales < 0)),
  stringsAsFactors = FALSE
)

Info kolom net_sales setelah cleaning:

library(knitr)
library(kableExtra)
library(dplyr)

kable(netsales_info, align = "lc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Keterangan Nilai
Tipe Data Setelah Cleaning numeric
Jumlah Nilai Negatif 0

Statistik deskriptif net_sales:

library(knitr)
library(kableExtra)
library(dplyr)

kable(desc, align = "lc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Statistik Nilai
Min. 0
1st Qu. 247700
Median 708293
Mean 1377342
3rd Qu. 1653688
Max. 18080224
# ============================================================
# SECTION C — 3. Handling Missing Values
# ============================================================
library(dplyr)

# Cleaning payment_method
df_clean$payment_method <- sapply(df_clean$payment_method, function(x) {
  if (is.na(x) || trimws(x) == "") {
    return("Unknown")
  } else {
    return(trimws(as.character(x)))
  }
})

# Cleaning customer_rating
df_clean$customer_rating <- as.numeric(df_clean$customer_rating)

median_rating <- median(df_clean$customer_rating, na.rm = TRUE)

df_clean$customer_rating[is.na(df_clean$customer_rating)] <- median_rating

# Ringkasan hasil
mv_result <- data.frame(
  Kolom = c("payment_method", "customer_rating"),
  Missing_Setelah = c(
    sum(is.na(df_clean$payment_method)),
    sum(is.na(df_clean$customer_rating))
  ),
  Metode_Imputasi = c(
    'Diisi "Unknown"',
    paste("Diisi median =", round(median_rating, 2))
  ),
  stringsAsFactors = FALSE
)

Hasil handling missing values:

library(knitr)
library(kableExtra)
library(dplyr)

kable(mv_result, align = "lcc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#27ae60", color = "white")
Kolom Missing_Setelah Metode_Imputasi
payment_method 0 Diisi “Unknown”
customer_rating 0 Diisi median = 5
# ============================================================
# SECTION C — 4. Standardisasi order_status
# ============================================================
library(dplyr)

# Sebelum cleaning
status_before <- df_clean %>%
  count(order_status, name = "Jumlah_Sebelum") %>%
  rename(`Order Status` = order_status)

# Fungsi standardisasi
standardize_status <- function(val) {
  if (is.na(val)) return("Unknown")
  
  val_clean <- tolower(trimws(val))
  
  if (val_clean %in% c("delivered", "completed", "complete")) {
    return("Completed")
  } else if (val_clean %in% c("cancelled", "cancel", "batal")) {
    return("Cancelled")
  } else if (val_clean %in% c("shipped", "on delivery", "on_delivery")) {
    return("Shipped")
  } else if (val_clean %in% c("returned", "retur", "return")) {
    return("Returned")
  } else {
    return(tools::toTitleCase(trimws(val)))
  }
}

# Terapkan ke data
df_clean$order_status <- sapply(df_clean$order_status, standardize_status)

# Setelah cleaning
status_after <- df_clean %>%
  count(order_status, name = "Jumlah_Setelah") %>%
  rename(`Order Status` = order_status)

Order Status — Sebelum standardisasi:

library(knitr)
library(kableExtra)
library(dplyr)

kable(status_before, align = "lc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#8e44ad", color = "white")
Order Status Jumlah_Sebelum
batal 18
cancelled 6
completed 42
delivered 75
on delivery 6
retur 6
returned 9
shipped 3
BATAL 5
Batal 130
CANCEL 180
CANCELLED 5
COMPLETED 55
Cancelled 230
DELIVERED 1980
Delivered 1940
ON DELIVERY 10
On Delivery 220
RETUR 205
Returned 165
SHIPPED 5
Shipped 265
batal 12
cancelled 174
completed 1903
delivered 1905
on delivery 4
retur 4
returned 201
shipped 237

Order Status — Setelah standardisasi:

library(knitr)
library(kableExtra)
library(dplyr)

kable(status_after, align = "lc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#27ae60", color = "white")
Order Status Jumlah_Setelah
Cancelled 760
Completed 7900
Returned 590
Shipped 750
# ============================================================
# SECTION C — 5. Looping: Cleaning 3 kolom teks + hapus duplikat
# ============================================================
library(dplyr)

text_columns <- c("category", "product_name", "region")

loop_summary <- data.frame()

for (col in text_columns) {
  sebelum <- length(unique(df_clean[[col]]))
  
  df_clean[[col]] <- sapply(df_clean[[col]], function(x) {
    if (!is.na(x)) {
      return(tools::toTitleCase(trimws(as.character(x))))
    } else {
      return(x)
    }
  })
  
  sesudah <- length(unique(df_clean[[col]]))
  
  loop_summary <- rbind(loop_summary, data.frame(
    Kolom = col,
    Unique_Sebelum = sebelum,
    Unique_Setelah = sesudah,
    stringsAsFactors = FALSE
  ))
}

# Hapus duplikat
n_before <- nrow(df_clean)

df_clean <- df_clean[!duplicated(df_clean), ]

n_removed <- n_before - nrow(df_clean)

# Dataframe hasil
loop_df <- loop_summary

dup_removed_df <- data.frame(
  Keterangan = c("Baris Sebelum", "Duplikat Dihapus", "Baris Setelah"),
  Nilai = c(n_before, n_removed, nrow(df_clean)),
  stringsAsFactors = FALSE
)

Hasil cleaning kolom teks (looping):

library(knitr)
library(kableExtra)
library(dplyr)

kable(loop_df, align = "lcc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Kolom Unique_Sebelum Unique_Setelah
category 31 13
product_name 49 25
region 8 8

Hasil penghapusan duplikat:

library(knitr)
library(kableExtra)
library(dplyr)

kable(dup_removed_df, align = "lc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Keterangan Nilai
Baris Sebelum 10000
Duplikat Dihapus 7525
Baris Setelah 2475

SECTION D — CONDITIONAL LOGIC

Tujuan: Menerapkan logika bisnis menggunakan if / if-else

# ============================================================
# SECTION D — 1. Kolom is_high_value
# ============================================================
library(dplyr)

df_clean$net_sales <- as.numeric(df_clean$net_sales)
df_clean$net_sales[is.na(df_clean$net_sales)] <- 0

df_clean$is_high_value <- ifelse(df_clean$net_sales > 1000000, "Yes", "No")

hv_df <- df_clean %>%
  count(is_high_value, name = "Jumlah")
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_ly(
  hv_df,
  labels = ~is_high_value,
  values = ~Jumlah,
  type = "pie",
  hole = 0.6,
  textinfo = "label+percent",
  showlegend = FALSE,
  
  marker = list(
    colors = c("#d9ecff", "#001f7a")
  )
) %>%
  layout(
    title = "Distribusi High Value Transaction"
  )

Distribusi kolom is_high_value:

library(knitr)
library(kableExtra)
library(dplyr)

kable(hv_df, align = "lc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
is_high_value Jumlah
No 1575
Yes 900
# ============================================================
# SECTION D — 2. Kolom order_priority (nested IF)
# ============================================================
library(dplyr)

# Buat kategori order_priority
df_clean$order_priority <- ifelse(
  df_clean$net_sales > 1000000, "High",
  ifelse(df_clean$net_sales >= 500000, "Medium", "Low")
)

# Ringkasan jumlah
op_df <- df_clean %>%
  count(order_priority, name = "Jumlah")
library(plotly)

plot_ly(
  op_df,
  labels = ~order_priority,
  values = ~Jumlah,
  type = "pie",
  hole = 0.6,
  textinfo = "label+percent",
  showlegend = FALSE,
  
  marker = list(
    colors = c(
      "#d9ecff",
      "#5fa8ff",
      "#001f7a"
    )
  )
) %>%
  layout(
    title = "Distribusi Order Priority"
  )

Distribusi kolom order_priority:

library(knitr)
library(kableExtra)
library(dplyr)

kable(op_df, align = "lc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
order_priority Jumlah
High 900
Low 1095
Medium 480
# ============================================================
# SECTION D — 3. Kolom valid_transaction
# ============================================================
library(dplyr)

# Validasi transaksi
df_clean$valid_transaction <- ifelse(
  df_clean$order_status == "Cancelled",
  "Invalid",
  "Valid"
)

# Ringkasan
vt_df <- df_clean %>%
  count(valid_transaction, name = "Jumlah")

# Preview data (5 baris pertama)
preview_df <- df_clean %>%
  select(order_id, platform, net_sales, order_status,
         is_high_value, order_priority, valid_transaction) %>%
  head(5)
library(plotly)

plot_ly(
  vt_df,
  labels = ~valid_transaction,
  values = ~Jumlah,
  type = "pie",
  hole = 0.6,
  textinfo = "label+percent",
  showlegend = FALSE,
  
  marker = list(
    colors = c(
      "#a8d1ff",
      "#001f7a"
    )
  )
) %>%
  layout(
    title = "Distribusi Valid Transaction"
  )

Distribusi kolom valid_transaction:

library(knitr)
library(kableExtra)
library(dplyr)

kable(vt_df, align = "lc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
valid_transaction Jumlah
Invalid 323
Valid 2152

Preview dataset final (5 baris pertama):

library(knitr)
library(kableExtra)
library(dplyr)

kable(preview_df) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "bordered"),
    full_width = TRUE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  scroll_box(width = "100%")
order_id platform net_sales order_status is_high_value order_priority valid_transaction
ORD00612 Tokopedia 680058 Completed No Medium Valid
ORD00112 TikTok Shop 1476873 Completed Yes High Valid
ORD01186 Tokopedia 369715 Completed No Low Valid
ORD01511 Tokopedia 1382570 Completed Yes High Valid
ORD00772 Tokopedia 376696 Completed No Low Valid
# Simpan ke CSV
write.csv(df_clean, "ecommerce_cleaned.csv", row.names = FALSE)

# Info file hasil
save_info <- data.frame(
  Keterangan = c("File Output", "Total Baris", "Total Kolom"),
  Nilai = c("ecommerce_cleaned.csv", nrow(df_clean), ncol(df_clean)),
  stringsAsFactors = FALSE
)
library(knitr)
library(kableExtra)
library(dplyr)

kable(save_info, align = "lc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#27ae60", color = "white")
Keterangan Nilai
File Output ecommerce_cleaned.csv
Total Baris 2475
Total Kolom 25

SECTION E — ANALYTICAL THINKING

Tujuan: Menarik insight sederhana dari data

# ============================================================
# SECTION E — Analytical Thinking
# ============================================================
library(dplyr)

# Platform
platform_vc <- df_clean %>%
  count(platform, name = "Jumlah_Transaksi") %>%
  rename(Platform = platform)

# Category
category_vc <- df_clean %>%
  count(category, name = "Jumlah_Transaksi") %>%
  rename(Category = category)

# Order Status
status_vc <- df_clean %>%
  count(order_status, name = "Jumlah_Transaksi") %>%
  rename(`Order Status` = order_status)

1. Distribusi transaksi per platform:

library(knitr)
library(kableExtra)
library(dplyr)

kable(platform_vc, align = "lc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2980b9", color = "white") %>%
  row_spec(1, bold = TRUE, background = "#d5e8f5")
Platform Jumlah_Transaksi
Blibli 514
Lazada 486
Shopee 519
TikTok Shop 481
Tokopedia 475
library(plotly)
library(dplyr)

platform_vc <- df_clean %>%
  count(platform, name = "Jumlah_Transaksi") %>%
  arrange(desc(Jumlah_Transaksi))

platform_vc$platform <- factor(
  platform_vc$platform,
  levels = platform_vc$platform
)

plot_ly(
  data = platform_vc,
  x = ~platform,
  y = ~Jumlah_Transaksi,
  type = "bar",
  text = ~Jumlah_Transaksi,
  textposition = "outside",
  marker = list(
    color = c(
      "#d9ecff",
      "#a8d1ff",
      "#5fa8ff",
      "#1f5fd6",
      "#001f7a"
    )
  )
) %>%
  layout(
    title = "1. Distribusi Transaksi per Platform",
    xaxis = list(title = "Platform"),
    yaxis = list(title = "Jumlah Transaksi"),
    plot_bgcolor = "#f8f9fa"
  )

2. Distribusi transaksi per kategori:

library(knitr)
library(kableExtra)
library(dplyr)

kable(category_vc, align = "lc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2980b9", color = "white") %>%
  row_spec(1, bold = TRUE, background = "#d5e8f5")
Category Jumlah_Transaksi
BEAUTY 10
Beauty 475
ELECTRONICS 15
Electronics 439
FASHION 36
Fashion 474
HOME LIVING 13
HOME_LIVING 2
Home Living 465
Home_Living 33
Home_living 2
SPORTS 12
Sports 499
library(plotly)
library(dplyr)

category_vc <- df_clean %>%
  count(category, name = "Jumlah_Transaksi") %>%
  arrange(desc(Jumlah_Transaksi))

category_vc$category <- factor(
  category_vc$category,
  levels = category_vc$category
)

plot_ly(
  data = category_vc,
  x = ~category,
  y = ~Jumlah_Transaksi,
  type = "bar",
  text = ~Jumlah_Transaksi,
  textposition = "outside",
  marker = list(
    color = c(
      "#d9ecff",
      "#b8dcff",
      "#8cc8ff",
      "#5fa8ff",
      "#3b82f6",
      "#1f5fd6",
      "#001f7a"
    )
  )
) %>%
  layout(
     title = "2. Distribusi Transaksi per Category",
    xaxis = list(title = "Category"),
    yaxis = list(title = "Jumlah Transaksi"),
    plot_bgcolor = "#f8f9fa"
  )

3. Distribusi transaksi per status order:

library(knitr)
library(kableExtra)
library(dplyr)

kable(status_vc, align = "lc") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "bordered"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2980b9", color = "white") %>%
  row_spec(1, bold = TRUE, background = "#d5e8f5")
Order Status Jumlah_Transaksi
Cancelled 323
Completed 1834
Returned 144
Shipped 174
library(plotly)
library(dplyr)

status_vc <- df_clean %>%
  count(order_status, name = "Jumlah_Transaksi") %>%
  arrange(desc(Jumlah_Transaksi))

status_vc$order_status <- factor(
  status_vc$order_status,
  levels = status_vc$order_status
)

plot_ly(
  data = status_vc,
  x = ~order_status,
  y = ~Jumlah_Transaksi,
  type = "bar",
  text = ~Jumlah_Transaksi,
  textposition = "outside",
  marker = list(
    color = c(
      "#d9ecff",
      "#8cc8ff",
      "#3b82f6",
      "#1f5fd6",
      "#001f7a"
    )
  )
) %>%
  layout(
    title = "3. Distribusi Transaksi per Order Status",
    xaxis = list(title = "Order Status"),
    yaxis = list(title = "Jumlah Transaksi"),
    plot_bgcolor = "#f8f9fa"
  )