Putri Adria Garini
NIM: 52250002
Hirose Kawarin Sirait
NIM: 52250012
Cecilia Mutiara Handayani
NIM: 52250013
Teknik: AJAX / JavaScript — Data diambil via endpoint JSON menggunakan httr + jsonlite
# Scrape the Oscar films for every year from the site's AJAX JSON endpoint and
# stack them into `all_data` (one row per film).
#
# The whole request/parse sequence for a year runs inside tryCatch, so a network
# failure, a non-2xx response or malformed JSON for one year is reported (with
# the underlying message) and skipped instead of aborting the run.
# Per-year frames are collected in a list and bound once at the end rather than
# growing the data frame on every iteration.
year_frames <- list()
for (year in 2010:2023) {
  url <- paste0("https://www.scrapethissite.com/pages/ajax-javascript/?ajax=true&year=", year)
  tryCatch({
    response <- GET(url, add_headers("User-Agent" = "Mozilla/5.0"))
    # Turn HTTP error statuses into R errors so they reach the handler below.
    stop_for_status(response)
    content_text <- content(response, "text", encoding = "UTF-8")
    films <- fromJSON(content_text)
    # Years without data yield an empty result; skip them silently.
    if (!is.null(films) && length(films) > 0) {
      df <- as.data.frame(films)
      year_frames[[length(year_frames) + 1]] <- df
      cat("Tahun", year, ":", nrow(df), "films\n")
    }
  }, error = function(e) {
    cat("Error tahun", year, ":", conditionMessage(e), "\n")
  })
  # Short pause between requests to stay polite to the server.
  Sys.sleep(0.5)
}
# Keep a plain (empty) data frame when nothing could be scraped.
all_data <- if (length(year_frames) > 0) bind_rows(year_frames) else data.frame()
## Tahun 2010 : 13 films
## Tahun 2011 : 15 films
## Tahun 2012 : 15 films
## Tahun 2013 : 12 films
## Tahun 2014 : 16 films
## Tahun 2015 : 16 films
##
## Total berhasil diambil: 87 baris
## Kolom: title, year, awards, nominations, best_picture
## Jumlah Baris : 87
## Jumlah Kolom : 5
## Nama Kolom : title, year, awards, nominations, best_picture
## Tipe Data:
## 'data.frame': 87 obs. of 5 variables:
## $ title : chr "The King's Speech" "Inception" "The Social Network" "The Fighter" ...
## $ year : int 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ awards : int 4 4 3 2 2 2 1 1 1 1 ...
## $ nominations : int 12 8 8 7 5 3 5 1 1 1 ...
## $ best_picture: logi TRUE NA NA NA NA NA ...
## NULL
##
## Missing Values:
## title year awards nominations best_picture
## 0 0 0 0 81
##
## Duplikat: 0 baris
##
## Data Issues:
## 1. Kolom best_picture bertipe logical (TRUE/FALSE), bukan teks kategori
## 2. Banyak nilai NA di kolom best_picture — perlu diisi nilai default
# LOOP: normalise every text column that is actually present in the data —
# strip surrounding whitespace, then convert to title case.
text_cols <- intersect(c("title", "award"), names(all_data))
for (text_col in text_cols) {
  trimmed <- trimws(all_data[[text_col]])
  all_data[[text_col]] <- tools::toTitleCase(tolower(trimmed))
  cat("Kolom", text_col, "sudah di-trim dan proper case\n")
}
## Kolom title sudah di-trim dan proper case
# IF: fill missing values with a default chosen by the column's type
#   logical -> FALSE, numeric -> 0, anything else -> "Unknown".
for (col in names(all_data)) {
  na_mask <- is.na(all_data[[col]])
  # Nothing to fill in this column.
  if (!any(na_mask)) {
    next
  }
  if (is.logical(all_data[[col]])) {
    all_data[[col]][na_mask] <- FALSE
    cat("", col, "-> diisi FALSE (logika: best_picture kosong = tidak menang)\n")
  } else if (is.numeric(all_data[[col]])) {
    all_data[[col]][na_mask] <- 0
    cat("", col, "-> diisi 0\n")
  } else {
    all_data[[col]][na_mask] <- "Unknown"
    cat("", col, "-> diisi Unknown\n")
  }
}
## best_picture -> diisi FALSE (logika: best_picture kosong = tidak menang)
# Convert the count/year columns to integer type.
for (int_col in c("year", "awards", "nominations")) {
  all_data[[int_col]] <- as.integer(all_data[[int_col]])
}
cat("\nTipe data year, awards, nominations -> integer\n")
##
## Tipe data year, awards, nominations -> integer
# Drop exact duplicate rows and report how many were removed.
before <- nrow(all_data)
all_data <- all_data %>%
  distinct()
cat("Duplikat dihapus:", before - nrow(all_data), "baris\n")
## Duplikat dihapus: 0 baris
## Dataset bersih: 87 baris
# Flag every row as "Complete" or "Incomplete".
#   Condition 1: element not found -> already handled earlier
#                (defaults FALSE / 0 / "Unknown")
#   Condition 2: incomplete record -> "Incomplete"
#   Condition 3: valid record      -> "Complete"
#
# A cell counts as missing when it is NA, an empty string, or "Unknown".
# The check runs column by column (vectorised) instead of apply() over rows:
# apply() first coerces the whole data frame to a character matrix, which
# discards column types and is slow for a simple per-cell test.
cell_is_missing <- function(column) {
  is.na(column) | as.character(column) %in% c("", "Unknown")
}
# OR the per-column masks together; the all-FALSE init keeps the result well
# defined even for a data frame with no columns.
row_has_missing <- Reduce(
  `|`,
  lapply(all_data, cell_is_missing),
  logical(nrow(all_data))
)
all_data$data_status <- ifelse(row_has_missing, "Incomplete", "Complete")
cat("Data Status:\n")
## Data Status:
##
## Complete
## 87
# Persist the cleaned Oscar dataset as CSV (without a row-name column) and
# report how many rows were written.
write.csv(x = all_data, file = "oscar_films.csv", row.names = FALSE)
cat("\nDisimpan ke oscar_films.csv! Total", nrow(all_data), "baris\n")
##
## Disimpan ke oscar_films.csv! Total 87 baris
Preview Data Oscar Films:
| title | year | awards | nominations | best_picture | data_status |
|---|---|---|---|---|---|
| The King’s Speech | 2010 | 4 | 12 | TRUE | Complete |
| Inception | 2010 | 4 | 8 | FALSE | Complete |
| The Social Network | 2010 | 3 | 8 | FALSE | Complete |
| The Fighter | 2010 | 2 | 7 | FALSE | Complete |
| Toy Story 3 | 2010 | 2 | 5 | FALSE | Complete |
| Alice in Wonderland | 2010 | 2 | 3 | FALSE | Complete |
| Black Swan | 2010 | 1 | 5 | FALSE | Complete |
| In a Better World | 2010 | 1 | 1 | FALSE | Complete |
| The Lost Thing | 2010 | 1 | 1 | FALSE | Complete |
| God of Love | 2010 | 1 | 1 | FALSE | Complete |
Teknik: iFrame — Akses langsung ke URL sumber frame menggunakan rvest
# Request the iframe's own source URL directly and collect every
# turtle-family card found on it.
iframe_url <- "https://www.scrapethissite.com/pages/frames/?frame=i"
page <- read_html(iframe_url)
cards <- html_elements(page, ".turtle-family-card")
cat("Total cards ditemukan:", length(cards), "\n")
## Total cards ditemukan: 14
# LOOP: build one row per turtle-family card with three columns
# (family_name, description, additional_info).
#
# The listing frame only yields the family name: the ".family-bio" and
# ".family-num-species" selectors match nothing there, so every row used to end
# up with the placeholder description and "Unknown" info. The card selectors are
# still tried first; when they come back empty, the card's link is followed to
# the family detail page and the text is read from there.
# NOTE(review): the detail-page selectors (".lead" for the description,
# ".common-name" for the additional info) are assumed from the site's markup —
# confirm them in browser DevTools.

# Trimmed text of the first node matching `css`, or NA when the node/text is absent.
first_text <- function(node, css) {
  if (is.null(node)) {
    return(NA_character_)
  }
  value <- node %>% html_element(css) %>% html_text(trim = TRUE)
  if (length(value) == 0 || is.na(value) || !nzchar(value)) NA_character_ else value
}

# Parsed detail page linked from a card, or NULL when there is no link or the
# request fails (the failure is reported, not raised).
read_card_detail <- function(card) {
  href <- card %>% html_element("a") %>% html_attr("href")
  if (length(href) == 0 || is.na(href)) {
    return(NULL)
  }
  detail_url <- xml2::url_absolute(href, "https://www.scrapethissite.com/pages/frames/")
  tryCatch(
    read_html(detail_url),
    error = function(e) {
      cat("Gagal membuka halaman detail:", detail_url, "-", conditionMessage(e), "\n")
      NULL
    }
  )
}

# Collect rows in a pre-allocated list and bind once, instead of growing the
# data frame on every iteration.
turtle_rows <- vector("list", length(cards))
for (i in seq_along(cards)) {
  card <- cards[[i]]

  name <- first_text(card, ".family-name")
  description <- first_text(card, ".family-bio")
  additional_info <- first_text(card, ".family-num-species")

  # Fall back to the detail page only when the card itself lacks the text.
  if (is.na(description) || is.na(additional_info)) {
    detail <- read_card_detail(card)
    if (is.na(description)) description <- first_text(detail, ".lead")
    if (is.na(additional_info)) additional_info <- first_text(detail, ".common-name")
    # Short pause between detail-page requests to stay polite to the server.
    Sys.sleep(0.5)
  }

  # Same defaults as before for anything that still could not be found.
  if (is.na(name)) name <- "Unknown"
  if (is.na(description)) description <- "No description available"
  if (is.na(additional_info)) additional_info <- "Unknown"

  turtle_rows[[i]] <- data.frame(
    family_name = name,
    description = description,
    additional_info = additional_info,
    stringsAsFactors = FALSE
  )
}
turtles_data <- if (length(turtle_rows) > 0) bind_rows(turtle_rows) else data.frame()
cat("Total berhasil diambil:", nrow(turtles_data), "baris\n")
## Total berhasil diambil: 14 baris
## Kolom: family_name, description, additional_info
Kolom family_name, description, additional_info — memenuhi requirement minimal 3 kolom (name, description, additional info)
## Jumlah Baris : 14
## Jumlah Kolom : 3
## Nama Kolom : family_name, description, additional_info
## Tipe Data:
## 'data.frame': 14 obs. of 3 variables:
## $ family_name : chr "Carettochelyidae" "Cheloniidae" "Chelydridae" "Dermatemydidae" ...
## $ description : chr "No description available" "No description available" "No description available" "No description available" ...
## $ additional_info: chr "Unknown" "Unknown" "Unknown" "Unknown" ...
## NULL
##
## Missing Values:
## family_name description additional_info
## 0 0 0
##
## Duplikat: 0
##
## Data Issues:
# Print the two data-quality issues noted for the turtle dataset, one per line.
writeLines(c(
  "1. Kolom description bisa sangat panjang dan mengandung newline/spasi berlebih — perlu di-trim",
  "2. Kolom additional_info mungkin mengandung teks campuran (misal: '14 species') — perlu distandarisasi"
))
## 1. Kolom description bisa sangat panjang dan mengandung newline/spasi berlebih — perlu di-trim
## 2. Kolom additional_info mungkin mengandung teks campuran (misal: '14 species') — perlu distandarisasi
# LOOP: clean the three text columns in one pass — trim the ends, collapse any
# run of whitespace (including newlines) to a single space, and title-case the
# family name only.
text_cols_turtle <- c("family_name", "description", "additional_info")
for (col in text_cols_turtle) {
  cleaned <- gsub("\\s+", " ", trimws(turtles_data[[col]]))
  if (col == "family_name") {
    cleaned <- tools::toTitleCase(tolower(cleaned))
  }
  turtles_data[[col]] <- cleaned
  cat("Kolom", col, "sudah di-clean\n")
}
## Kolom family_name sudah di-clean
## Kolom description sudah di-clean
## Kolom additional_info sudah di-clean
# IF: replace missing values (NA or empty string) with a column-specific
# default: the description gets "No description available", every other
# column gets "Unknown".
for (col in names(turtles_data)) {
  blank_mask <- is.na(turtles_data[[col]]) | turtles_data[[col]] == ""
  # Column has nothing to fill.
  if (!any(blank_mask)) {
    next
  }
  fill_value <- switch(col,
    description = "No description available",
    "Unknown"
  )
  turtles_data[[col]][blank_mask] <- fill_value
  cat("Missing di", col, paste0("-> diisi '", fill_value, "'\n"))
}
# Remove exact duplicate rows and report the number dropped.
before <- nrow(turtles_data)
turtles_data <- turtles_data %>%
  distinct()
cat("Duplikat dihapus:", before - nrow(turtles_data), "baris\n")
## Duplikat dihapus: 0 baris
## Dataset bersih: 14 baris
# Flag every row as "Complete" or "Incomplete".
#   Condition 1: element not found -> already given a default while scraping
#   Condition 2: incomplete record -> "Incomplete"
#   Condition 3: valid record      -> "Complete"
#
# A field is incomplete when it is NA, empty, or still holds its placeholder
# default. All three scraped columns are checked, so a row whose
# additional_info is still "Unknown" is no longer reported as complete, and
# %in% keeps the result free of NA even if a value is missing.
is_placeholder <- function(values, placeholder) {
  is.na(values) | values %in% c("", placeholder)
}
row_incomplete <-
  is_placeholder(turtles_data$family_name, "Unknown") |
  is_placeholder(turtles_data$description, "No description available") |
  is_placeholder(turtles_data$additional_info, "Unknown")
turtles_data$data_status <- ifelse(row_incomplete, "Incomplete", "Complete")
cat("Data Status:\n")
## Data Status:
##
## Incomplete
## 14
# Persist the cleaned turtle dataset as CSV (without a row-name column) and
# report how many rows were written.
write.csv(x = turtles_data, file = "turtles.csv", row.names = FALSE)
cat("\nDisimpan ke turtles.csv! Total", nrow(turtles_data), "baris\n")
##
## Disimpan ke turtles.csv! Total 14 baris
Preview Data Turtles:
| family_name | description | additional_info | data_status |
|---|---|---|---|
| Carettochelyidae | No description available | Unknown | Incomplete |
| Cheloniidae | No description available | Unknown | Incomplete |
| Chelydridae | No description available | Unknown | Incomplete |
| Dermatemydidae | No description available | Unknown | Incomplete |
| Dermochelyidae | No description available | Unknown | Incomplete |
| Emydidae | No description available | Unknown | Incomplete |
| Geoemydidae | No description available | Unknown | Incomplete |
| Kinosternidae | No description available | Unknown | Incomplete |
| Platysternidae | No description available | Unknown | Incomplete |
| Testudinidae | No description available | Unknown | Incomplete |
| Trionychidae | No description available | Unknown | Incomplete |
| Chelidae | No description available | Unknown | Incomplete |
| Pelomedusidae | No description available | Unknown | Incomplete |
| Podocnemididae | No description available | Unknown | Incomplete |
| Pendekatan | Karakteristik | Tools | Contoh |
|---|---|---|---|
| Static | Data langsung ada di HTML saat halaman dimuat | requests + BS4 / rvest | Countries |
| Pagination | Data tersebar di banyak halaman dengan parameter page di URL | requests + looping page | Hockey Teams |
| AJAX | Data dimuat JavaScript setelah halaman terbuka — perlu cari endpoint API | httr + fromJSON | Oscar Films |
| iFrame | Konten ada di frame terpisah — akses URL iframe langsung | rvest + URL frame | Turtles |
## Top 5 Film Nominasi Terbanyak:
| title | year | nominations | awards |
|---|---|---|---|
| The King’s Speech | 2010 | 12 | 4 |
| Lincoln | 2012 | 12 | 2 |
| The Revenant | 2015 | 12 | 3 |
| Hugo | 2011 | 11 | 5 |
| Life of Pi | 2012 | 11 | 4 |
## Rata-rata film per tahun: 14.5
## Total famili kura-kura: 14
# Average number of films per year (mean of the per-year row counts).
# NOTE(review): `oscar` is not defined in the visible code — presumably the
# Oscar dataset (`all_data` / oscar_films.csv); confirm.
mean(table(oscar$year))
Rekomendasi 1:
Gunakan browser DevTools (Network tab) untuk menemukan endpoint AJAX sebelum menulis kode — lebih efisien daripada Selenium.
Rekomendasi 2:
Tambahkan Sys.sleep() antar-request saat scraping banyak halaman untuk menghindari pemblokiran IP.
Rekomendasi 3:
Selalu simpan data mentah sebelum cleaning agar tidak perlu scraping ulang jika cleaning gagal.