Teknik: AJAX / JavaScript — Data diambil via endpoint JSON menggunakan httr + jsonlite
# Scrape the Oscar-film JSON endpoint one year at a time and stack the results.
# Each year's rows are collected in a list and bound once after the loop, so
# the accumulated data frame is not re-copied on every iteration.
years <- 2010:2023
base_url <- "https://www.scrapethissite.com/pages/ajax-javascript/?ajax=true&year="
yearly <- vector("list", length(years))
for (i in seq_along(years)) {
  year <- years[[i]]
  url <- paste0(base_url, year)
  # The request itself sits inside tryCatch() so that a network failure or a
  # non-success HTTP status skips this year instead of aborting the whole run.
  yearly[[i]] <- tryCatch({
    response <- GET(url, add_headers("User-Agent" = "Mozilla/5.0"))
    httr::stop_for_status(response)
    content_text <- content(response, "text", encoding = "UTF-8")
    films <- fromJSON(content_text)
    if (!is.null(films) && length(films) > 0) {
      df <- as.data.frame(films)
      cat("Tahun", year, ":", nrow(df), "films\n")
      df
    } else {
      # Years with no films contribute nothing.
      NULL
    }
  }, error = function(e) {
    # Report why the year failed rather than swallowing the condition.
    cat("Error tahun", year, ":", conditionMessage(e), "\n")
    NULL
  })
}
yearly <- Filter(Negate(is.null), yearly)
# Fall back to an empty data frame when no year returned any rows.
all_data <- if (length(yearly) > 0) bind_rows(yearly) else data.frame()
## Tahun 2010 : 13 films
## Tahun 2011 : 15 films
## Tahun 2012 : 15 films
## Tahun 2013 : 12 films
## Tahun 2014 : 16 films
## Tahun 2015 : 16 films
##
## Total berhasil diambil: 87 baris
## Jumlah Baris : 87
## Jumlah Kolom : 5
## Nama Kolom : title, year, awards, nominations, best_picture
## Tipe Data:
## 'data.frame': 87 obs. of 5 variables:
## $ title : chr "The King's Speech" "Inception" "The Social Network" "The Fighter" ...
## $ year : int 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ awards : int 4 4 3 2 2 2 1 1 1 1 ...
## $ nominations : int 12 8 8 7 5 3 5 1 1 1 ...
## $ best_picture: logi TRUE NA NA NA NA NA ...
## NULL
##
## Missing Values:
## title year awards nominations best_picture
## 0 0 0 0 81
##
## Duplikat: 0 baris
##
## Data Issues:
## 1. Kolom best_picture bertipe logical (TRUE/FALSE), bukan teks kategori
## 2. Banyak nilai NA di kolom best_picture — perlu diisi nilai default
# Clean the text columns: strip surrounding whitespace, then title-case.
# intersect() keeps only the listed names that actually exist in all_data.
text_cols <- intersect(c("title", "award"), names(all_data))
for (col in text_cols) {
  values <- trimws(all_data[[col]])
  # Transform only the non-missing entries: tools::toTitleCase() does not
  # preserve NA (it comes back as the string "NA"), which would hide the gap
  # from any later is.na() check.
  present <- !is.na(values)
  if (any(present)) {
    values[present] <- tools::toTitleCase(tolower(values[present]))
  }
  all_data[[col]] <- values
  cat("Kolom", col, "sudah di-trim dan proper case\n")
}
## Kolom title sudah di-trim dan proper case
# Fill missing values column by column; the replacement depends on the
# column's type (logical -> FALSE, numeric -> 0, anything else -> "Unknown").
for (col in names(all_data)) {
  gaps <- is.na(all_data[[col]])
  if (!any(gaps)) {
    # Nothing missing in this column.
    next
  }
  if (is.logical(all_data[[col]])) {
    all_data[[col]][gaps] <- FALSE
    cat("", col, "→ diisi FALSE\n")
  } else if (is.numeric(all_data[[col]])) {
    all_data[[col]][gaps] <- 0
    cat("", col, "→ diisi 0\n")
  } else {
    all_data[[col]][gaps] <- "Unknown"
    cat("", col, "→ diisi Unknown\n")
  }
}
## best_picture → diisi FALSE
# Convert the year and count columns to integer.
for (int_col in c("year", "awards", "nominations")) {
  all_data[[int_col]] <- as.integer(all_data[[int_col]])
}
cat("Tipe data year, awards, nominations → integer\n")
## Tipe data year, awards, nominations → integer
# Drop exact duplicate rows and report how many were removed.
before <- nrow(all_data)
all_data <- all_data %>% distinct()
removed <- before - nrow(all_data)
cat(" Duplikat dihapus:", removed, "baris\n")
## Duplikat dihapus: 0 baris
## Dataset bersih: 87 baris
# Flag each row as "Incomplete" when any cell is NA or an empty string,
# otherwise "Complete". Columns are checked one at a time instead of through
# apply(), which would first coerce the whole data frame to a character matrix.
row_has_blank <- Reduce(
  `|`,
  lapply(all_data, function(column) {
    blank <- is.na(column)
    if (is.character(column) || is.factor(column)) {
      # Only text-like columns can hold ""; NA cells are already flagged above.
      blank <- blank | (!is.na(column) & column == "")
    }
    blank
  }),
  logical(nrow(all_data))
)
# Index into the two labels so the result is character even for zero rows.
all_data$data_status <- c("Complete", "Incomplete")[row_has_blank + 1L]
cat("Data Status:\n")
## Data Status:
##
## Complete
## 87
# Save the cleaned Oscar data as CSV (no row names) and print a confirmation.
saved_rows <- nrow(all_data)
write.csv(x = all_data, file = "oscar_films.csv", row.names = FALSE)
cat("\n Disimpan ke oscar_films.csv! Total", saved_rows, "baris\n")
##
## Disimpan ke oscar_films.csv! Total 87 baris
Preview Data Oscar Films:
| title | year | awards | nominations | best_picture | data_status |
|---|---|---|---|---|---|
| The King’s Speech | 2010 | 4 | 12 | TRUE | Complete |
| Inception | 2010 | 4 | 8 | FALSE | Complete |
| The Social Network | 2010 | 3 | 8 | FALSE | Complete |
| The Fighter | 2010 | 2 | 7 | FALSE | Complete |
| Toy Story 3 | 2010 | 2 | 5 | FALSE | Complete |
| Alice in Wonderland | 2010 | 2 | 3 | FALSE | Complete |
| Black Swan | 2010 | 1 | 5 | FALSE | Complete |
| In a Better World | 2010 | 1 | 1 | FALSE | Complete |
| The Lost Thing | 2010 | 1 | 1 | FALSE | Complete |
| God of Love | 2010 | 1 | 1 | FALSE | Complete |
Teknik: iFrame — Akses langsung ke URL sumber frame menggunakan rvest
# Read the frame's own URL directly and select the turtle family cards.
iframe_url <- "https://www.scrapethissite.com/pages/frames/?frame=i"
page <- read_html(iframe_url)
cards <- html_elements(page, ".turtle-family-card")
cat("Total cards ditemukan:", length(cards), "\n")
## Total cards ditemukan: 14
# Extract the family name from every card in one vectorized pass.
# html_element() yields one result per card (a missing node where nothing
# matches) and html_text() returns NA for those -- never NULL or length 0 --
# so the "Unknown" fallback has to test is.na().
family_names <- cards %>%
  html_element(".family-name") %>%
  html_text(trim = TRUE)
family_names[is.na(family_names)] <- "Unknown"
# Built in one step, so the family_name column exists even with zero cards.
turtles_data <- data.frame(family_name = family_names)
cat("Total berhasil diambil:", nrow(turtles_data), "baris")
## Total berhasil diambil: 14 baris
## Jumlah Baris : 14
## Jumlah Kolom : 1
## Nama Kolom : family_name
## Tipe Data:
## 'data.frame': 14 obs. of 1 variable:
## $ family_name: chr "Carettochelyidae" "Cheloniidae" "Chelydridae" "Dermatemydidae" ...
## NULL
##
## Missing Values:
## family_name
## 0
##
## Duplikat: 0
##
## Data Issues:
## 1. Data hanya 1 kolom — informasi tambahan tidak tersedia
## 2. Nama famili mungkin mengandung spasi berlebih
# Clean every column of turtles_data: strip surrounding whitespace, then
# title-case the text.
for (col in names(turtles_data)) {
  values <- trimws(turtles_data[[col]])
  # Transform only the non-missing entries: tools::toTitleCase() does not
  # preserve NA (it comes back as the string "NA"), which would hide the gap
  # from any later is.na() check.
  present <- !is.na(values)
  if (any(present)) {
    values[present] <- tools::toTitleCase(tolower(values[present]))
  }
  turtles_data[[col]] <- values
  cat("Kolom", col, "sudah di-trim dan proper case\n")
}
## Kolom family_name sudah di-trim dan proper case
# Replace NA with "Unknown" in every column that has at least one NA.
for (col in names(turtles_data)) {
  gaps <- is.na(turtles_data[[col]])
  if (any(gaps)) {
    turtles_data[[col]][gaps] <- "Unknown"
    cat("Missing value di", col, "diisi Unknown\n")
  }
}
# Drop exact duplicate rows and report how many were removed.
before <- nrow(turtles_data)
turtles_data <- turtles_data %>% distinct()
removed <- before - nrow(turtles_data)
cat(" Duplikat dihapus:", removed, "baris\n")
## Duplikat dihapus: 0 baris
## Dataset bersih: 14 baris
# Mark rows whose family name is NA, blank, or the "Unknown" placeholder as
# "Incomplete"; everything else is "Complete". %in% matches NA against NA.
placeholder_values <- c(NA, "", "Unknown")
name_missing <- turtles_data$family_name %in% placeholder_values
turtles_data$data_status <- ifelse(name_missing, "Incomplete", "Complete")
cat("Data Status:\n")
## Data Status:
##
## Complete
## 14
# Save the cleaned turtle data as CSV (no row names) and print a confirmation.
saved_rows <- nrow(turtles_data)
write.csv(x = turtles_data, file = "turtles.csv", row.names = FALSE)
cat("\n Disimpan ke turtles.csv! Total", saved_rows, "baris\n")
##
## Disimpan ke turtles.csv! Total 14 baris
Preview Data Turtles:
| family_name | data_status |
|---|---|
| Carettochelyidae | Complete |
| Cheloniidae | Complete |
| Chelydridae | Complete |
| Dermatemydidae | Complete |
| Dermochelyidae | Complete |
| Emydidae | Complete |
| Geoemydidae | Complete |
| Kinosternidae | Complete |
| Platysternidae | Complete |
| Testudinidae | Complete |
| Trionychidae | Complete |
| Chelidae | Complete |
| Pelomedusidae | Complete |
| Podocnemididae | Complete |
| Pendekatan | Karakteristik | Tools | Contoh |
|---|---|---|---|
| Static | Data langsung ada di HTML saat halaman dimuat | requests + BS4 / rvest | Countries |
| Pagination | Data tersebar di banyak halaman dengan parameter page di URL | requests + looping page | Hockey Teams |
| AJAX | Data dimuat JavaScript setelah halaman terbuka — perlu cari endpoint API | httr + fromJSON | Oscar Films |
| iFrame | Konten ada di frame terpisah — akses URL iframe langsung | rvest + URL frame | Turtles |
## Top 5 Film Nominasi Terbanyak:
| title | year | nominations | awards |
|---|---|---|---|
| The King’s Speech | 2010 | 12 | 4 |
| Lincoln | 2012 | 12 | 2 |
| The Revenant | 2015 | 12 | 3 |
| Hugo | 2011 | 11 | 5 |
| Life of Pi | 2012 | 11 | 4 |
Rekomendasi 1:
Gunakan browser DevTools (tab Network) untuk menemukan endpoint AJAX sebelum menulis kode — lebih efisien daripada Selenium.
Rekomendasi 2:
Tambahkan Sys.sleep() antar-request saat scraping banyak halaman untuk menghindari pemblokiran IP.
Rekomendasi 3:
Selalu simpan data mentah sebelum cleaning agar tidak perlu scraping ulang jika cleaning gagal.