Website 3 — Oscar Winning Films

Teknik: AJAX / JavaScript — Data diambil via endpoint JSON menggunakan httr + jsonlite

Section A — Data Collection

# Collect Oscar-winning films, one request per year, from the site's AJAX
# JSON endpoint and stack the yearly results into `all_data`.
#
# - Each year's result is kept in a pre-allocated list and bound once at the
#   end instead of growing the data frame on every iteration.
# - The HTTP request itself runs inside tryCatch(), and non-2xx responses are
#   turned into errors, so a network failure on one year does not abort the
#   whole run.
# - Years that return an empty payload are reported instead of being skipped
#   silently, and the underlying error message is shown on failure.
# Lines starting with `##` are recorded output.
years <- 2010:2023
yearly_films <- vector("list", length(years))

for (i in seq_along(years)) {
  year <- years[[i]]
  url <- paste0("https://www.scrapethissite.com/pages/ajax-javascript/?ajax=true&year=", year)
  tryCatch({
    response <- GET(url, add_headers("User-Agent" = "Mozilla/5.0"))
    stop_for_status(response)  # raise an error on 4xx/5xx responses
    content_text <- content(response, "text", encoding = "UTF-8")
    films <- fromJSON(content_text)
    if (!is.null(films) && length(films) > 0) {
      df <- as.data.frame(films)
      yearly_films[[i]] <- df
      cat("Tahun", year, ":", nrow(df), "films\n")
    } else {
      # Reported on stderr so the per-year stdout log keeps its format
      message("Tahun ", year, " : tidak ada data")
    }
  }, error = function(e) {
    cat("Error tahun", year, ":", conditionMessage(e), "\n")
  })
  Sys.sleep(0.5)  # small pause between requests to avoid hammering the server
}

# Drop years without data, then bind everything in one pass; the leading empty
# data.frame keeps `all_data` a data.frame even when nothing was fetched.
yearly_films <- Filter(Negate(is.null), yearly_films)
all_data <- bind_rows(c(list(data.frame()), yearly_films))
## Tahun 2010 : 13 films
## Tahun 2011 : 15 films
## Tahun 2012 : 15 films
## Tahun 2013 : 12 films
## Tahun 2014 : 16 films
## Tahun 2015 : 16 films
cat("\n Total berhasil diambil:", nrow(all_data), "baris")
## 
##  Total berhasil diambil: 87 baris
87
Total Baris
5
Total Kolom
2010–2015 (diminta: 2010–2023)
Rentang Tahun

Section B — Data Handling

# Structural overview of the raw Oscar data: dimensions, column names,
# column types, missing values per column and duplicate rows, followed by
# the data issues noted for the cleaning step.
# Lines starting with `##` are recorded output.
cat("Jumlah Baris  :", nrow(all_data), "\n")
## Jumlah Baris  : 87
cat("Jumlah Kolom  :", ncol(all_data), "\n")
## Jumlah Kolom  : 5
cat("Nama Kolom    :", paste(names(all_data), collapse = ", "), "\n\n")
## Nama Kolom    : title, year, awards, nominations, best_picture
# str() prints the structure itself and returns NULL; wrapping it in print()
# only appended a stray "NULL" line to the report.
cat("Tipe Data:\n"); str(all_data)
## Tipe Data:
## 'data.frame':    87 obs. of  5 variables:
##  $ title       : chr  "The King's Speech" "Inception" "The Social Network" "The Fighter" ...
##  $ year        : int  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ awards      : int  4 4 3 2 2 2 1 1 1 1 ...
##  $ nominations : int  12 8 8 7 5 3 5 1 1 1 ...
##  $ best_picture: logi  TRUE NA NA NA NA NA ...
cat("\nMissing Values:\n"); print(colSums(is.na(all_data)))
## 
## Missing Values:
##        title         year       awards  nominations best_picture 
##            0            0            0            0           81
cat("\nDuplikat:", sum(duplicated(all_data)), "baris\n")
## 
## Duplikat: 0 baris
cat("\n Data Issues:\n")
## 
##  Data Issues:
cat("1. Kolom best_picture bertipe logical (TRUE/FALSE), bukan teks kategori\n")
## 1. Kolom best_picture bertipe logical (TRUE/FALSE), bukan teks kategori
cat("2. Banyak nilai NA di kolom best_picture — perlu diisi nilai default\n")
## 2. Banyak nilai NA di kolom best_picture — perlu diisi nilai default

Section C — Data Cleaning

# Normalise the free-text columns: trim surrounding whitespace and convert to
# title case. Only the candidate columns that actually exist are processed.
# Missing entries are left untouched so the NA-handling step that follows
# still sees them as NA, instead of relying on the string helpers to pass NA
# through unchanged.
text_cols <- intersect(c("title", "award"), names(all_data))
for (col in text_cols) {
  values <- as.character(all_data[[col]])
  present <- !is.na(values)
  if (any(present)) {
    values[present] <- tools::toTitleCase(tolower(trimws(values[present])))
  }
  all_data[[col]] <- values
  cat("Kolom", col, "sudah di-trim dan proper case\n")
}
## Kolom title sudah di-trim dan proper case
# Fill missing values column by column; the replacement depends on the
# column type: FALSE for logical, 0 for numeric, "Unknown" for anything else.
# Columns without missing values are skipped.
for (col in names(all_data)) {
  is_missing <- is.na(all_data[[col]])
  n_missing <- sum(is_missing)
  if (n_missing == 0) {
    next
  }

  if (is.logical(all_data[[col]])) {
    all_data[[col]][is_missing] <- FALSE
    cat("", col, "→ diisi FALSE\n")
  } else if (is.numeric(all_data[[col]])) {
    all_data[[col]][is_missing] <- 0
    cat("", col, "→ diisi 0\n")
  } else {
    all_data[[col]][is_missing] <- "Unknown"
    cat("", col, "→ diisi Unknown\n")
  }
}
##  best_picture → diisi FALSE
# Coerce the count/year columns to integer
for (int_col in c("year", "awards", "nominations")) {
  all_data[[int_col]] <- as.integer(all_data[[int_col]])
}
cat("Tipe data year, awards, nominations → integer\n")
## Tipe data year, awards, nominations → integer
# Remove exact duplicate rows and report how many were dropped
before <- nrow(all_data)
all_data <- all_data %>% distinct()
removed <- before - nrow(all_data)
cat(" Duplikat dihapus:", removed, "baris\n")
##  Duplikat dihapus: 0 baris
cat(" Dataset bersih:", nrow(all_data), "baris\n")
##  Dataset bersih: 87 baris

Section D — Conditional Logic

# Flag each row as "Incomplete" when any field is NA or an empty string,
# otherwise "Complete", then export the cleaned table to CSV.
# The check is vectorised over the whole data frame instead of using
# apply(all_data, 1, ...), which coerces every row to character before the
# comparison and evaluates an R closure once per row.
# Lines starting with `##` are recorded output.
has_na <- rowSums(is.na(all_data)) > 0
has_blank <- rowSums(all_data == "", na.rm = TRUE) > 0
all_data$data_status <- ifelse(has_na | has_blank, "Incomplete", "Complete")

cat("Data Status:\n")
## Data Status:
print(table(all_data$data_status))
## 
## Complete 
##       87
write.csv(all_data, "oscar_films.csv", row.names = FALSE)
cat("\n Disimpan ke oscar_films.csv! Total", nrow(all_data), "baris\n")
## 
##  Disimpan ke oscar_films.csv! Total 87 baris

Preview Data Oscar Films:

title year awards nominations best_picture data_status
The King’s Speech 2010 4 12 TRUE Complete
Inception 2010 4 8 FALSE Complete
The Social Network 2010 3 8 FALSE Complete
The Fighter 2010 2 7 FALSE Complete
Toy Story 3 2010 2 5 FALSE Complete
Alice in Wonderland 2010 2 3 FALSE Complete
Black Swan 2010 1 5 FALSE Complete
In a Better World 2010 1 1 FALSE Complete
The Lost Thing 2010 1 1 FALSE Complete
God of Love 2010 1 1 FALSE Complete

Website 4 — Turtles (iFrame)

Teknik: iFrame — Akses langsung ke URL sumber frame menggunakan rvest

Section A — Data Collection

# Scrape the turtle families directly from the iframe's own URL: read the
# page, select every family card, and pull the family name out of each card.
#
# The names are extracted in one vectorised call on the whole node set rather
# than by growing a data frame card by card. A card without a `.family-name`
# node yields NA from html_text(); those are mapped to "Unknown", which is the
# placeholder the later status check treats as incomplete.
# Lines starting with `##` are recorded output.
iframe_url <- "https://www.scrapethissite.com/pages/frames/?frame=i"
page <- read_html(iframe_url)
cards <- page %>% html_elements(".turtle-family-card")
cat("Total cards ditemukan:", length(cards), "\n")
## Total cards ditemukan: 14
family_names <- cards %>%
  html_element(".family-name") %>%
  html_text(trim = TRUE)
family_names[is.na(family_names)] <- "Unknown"
turtles_data <- data.frame(family_name = family_names)

cat("Total berhasil diambil:", nrow(turtles_data), "baris")
## Total berhasil diambil: 14 baris
14
Total Famili
1
Total Kolom
iFrame
Teknik Scraping

Section B — Data Handling

# Structural overview of the raw turtle data: dimensions, column names,
# column types, missing values and duplicate rows, followed by the data
# issues noted for the cleaning step.
# Lines starting with `##` are recorded output.
cat("Jumlah Baris  :", nrow(turtles_data), "\n")
## Jumlah Baris  : 14
cat("Jumlah Kolom  :", ncol(turtles_data), "\n")
## Jumlah Kolom  : 1
cat("Nama Kolom    :", paste(names(turtles_data), collapse = ", "), "\n\n")
## Nama Kolom    : family_name
# str() prints the structure itself and returns NULL; wrapping it in print()
# only appended a stray "NULL" line to the report.
cat("Tipe Data:\n"); str(turtles_data)
## Tipe Data:
## 'data.frame':    14 obs. of  1 variable:
##  $ family_name: chr  "Carettochelyidae" "Cheloniidae" "Chelydridae" "Dermatemydidae" ...
cat("\nMissing Values:\n"); print(colSums(is.na(turtles_data)))
## 
## Missing Values:
## family_name 
##           0
cat("\nDuplikat:", sum(duplicated(turtles_data)), "\n")
## 
## Duplikat: 0
cat("\n Data Issues:\n")
## 
##  Data Issues:
cat("1. Data hanya 1 kolom — informasi tambahan tidak tersedia\n")
## 1. Data hanya 1 kolom — informasi tambahan tidak tersedia
cat("2. Nama famili mungkin mengandung spasi berlebih\n")
## 2. Nama famili mungkin mengandung spasi berlebih

Section C — Data Cleaning

# Clean the turtle table in three passes: normalise text, fill missing
# values, and drop duplicate rows.
# Lines starting with `##` are recorded output.

# Pass 1: trim whitespace and title-case every column. Missing entries are
# skipped so that pass 2 still sees them as NA, instead of relying on the
# string helpers to pass NA through unchanged.
for (col in names(turtles_data)) {
  values <- as.character(turtles_data[[col]])
  present <- !is.na(values)
  if (any(present)) {
    values[present] <- tools::toTitleCase(tolower(trimws(values[present])))
  }
  turtles_data[[col]] <- values
  cat("Kolom", col, "sudah di-trim dan proper case\n")
}
## Kolom family_name sudah di-trim dan proper case
# Pass 2: replace any remaining missing values with the "Unknown" placeholder
for (col in names(turtles_data)) {
  if (sum(is.na(turtles_data[[col]])) > 0) {
    turtles_data[[col]][is.na(turtles_data[[col]])] <- "Unknown"
    cat("Missing value di", col, "diisi Unknown\n")
  }
}

# Pass 3: remove exact duplicate rows and report how many were dropped
before <- nrow(turtles_data)
turtles_data <- distinct(turtles_data)
cat(" Duplikat dihapus:", before - nrow(turtles_data), "baris\n")
##  Duplikat dihapus: 0 baris
cat(" Dataset bersih:", nrow(turtles_data), "baris\n")
##  Dataset bersih: 14 baris

Section D — Conditional Logic

# Mark a family as "Incomplete" when its name is missing, empty, or the
# "Unknown" placeholder; everything else is "Complete". Then print the status
# counts and export the table to CSV.
# Lines starting with `##` are recorded output.
family <- turtles_data$family_name
needs_review <- is.na(family) | family %in% c("", "Unknown")
turtles_data$data_status <- ifelse(needs_review, "Incomplete", "Complete")
cat("Data Status:\n")
## Data Status:
status_counts <- table(turtles_data$data_status)
print(status_counts)
## 
## Complete 
##       14
write.csv(turtles_data, "turtles.csv", row.names = FALSE)
cat("\n Disimpan ke turtles.csv! Total", nrow(turtles_data), "baris\n")
## 
##  Disimpan ke turtles.csv! Total 14 baris

Preview Data Turtles:

family_name data_status
Carettochelyidae Complete
Cheloniidae Complete
Chelydridae Complete
Dermatemydidae Complete
Dermochelyidae Complete
Emydidae Complete
Geoemydidae Complete
Kinosternidae Complete
Platysternidae Complete
Testudinidae Complete
Trionychidae Complete
Chelidae Complete
Pelomedusidae Complete
Podocnemididae Complete

Section E — Analytical Thinking

Tingkat Kesulitan Scraping

🟢 Paling Mudah: Countries of the World — HTML statis, data langsung tersedia tanpa JavaScript
🔴 Paling Sulit: Oscar Winning Films — Data dimuat via AJAX, perlu inspect Network tab untuk temukan endpoint JSON

Perbedaan Pendekatan

Pendekatan Karakteristik Tools Contoh
Static Data langsung ada di HTML saat halaman dimuat requests + BS4 / rvest Countries
Pagination Data tersebar di banyak halaman dengan parameter page di URL requests + looping page Hockey Teams
AJAX Data dimuat JavaScript setelah halaman terbuka — perlu cari endpoint API httr + fromJSON Oscar Films
iFrame Konten ada di frame terpisah — akses URL iframe langsung rvest + URL frame Turtles

Insights & Rekomendasi

##  Top 5 Film Nominasi Terbanyak:
Top 5 Film Oscar dengan Nominasi Terbanyak
title year nominations awards
The King’s Speech 2010 12 4
Lincoln 2012 12 2
The Revenant 2015 12 3
Hugo 2011 11 5
Life of Pi 2012 11 4
Insight 1: Film dengan nominasi terbanyak tidak selalu menang Best Picture
Insight 2: Dari rentang 2010–2023 yang diminta, hanya tahun 2010–2015 yang mengembalikan data — 87 film pemenang, atau rata-rata sekitar 14–15 film per tahun
Insight 3: Seluruh 14 famili kura-kura berhasil diambil lengkap dari iFrame

Rekomendasi 1:
Gunakan browser DevTools (Network tab) untuk menemukan endpoint AJAX sebelum menulis kode — lebih efisien daripada Selenium

Rekomendasi 2:
Tambahkan Sys.sleep() antar request saat scraping banyak halaman untuk hindari pemblokiran IP

Rekomendasi 3:
Selalu simpan data mentah sebelum cleaning agar tidak perlu scraping ulang jika cleaning gagal