library(rvest)
library(dplyr)
library(purrr)
library(stringr)
library(robotstxt)
# Question 5 ----
# Ask robots.txt whether the target article may be fetched before scraping it
paths_allowed(paths = "https://en.wikipedia.org/wiki/2026_in_film")
## Warning: package 'future' was built under R version 4.4.3
## en.wikipedia.org
## [1] TRUE
# Question 6 ----
# Download the "2026 in film" article and parse every table element on it.
url_2026 <- "https://en.wikipedia.org/wiki/2026_in_film"
page_2026 <- read_html(url_2026)
# `fill` is deprecated in rvest >= 1.0.0 (missing cells are always filled with
# NA), so html_table() is called without it.
tables_2026 <- page_2026 %>%
  html_elements("table") %>%
  map(html_table)

# Keep only tables with a column name matching "Worldwide gross"
# (case-insensitive) and take the first one. pluck(1) returns NULL when no
# table qualifies.
hg_2026 <- tables_2026 %>%
  keep(
    ~ any(str_detect(
      names(.x),
      regex("Worldwide\\s*gross", ignore_case = TRUE)
    ))
  ) %>%
  pluck(1)
# Check if table was successfully scraped.
# hg_2026 is NULL when no table matched; nrow(NULL) is NULL, and
# `if (NULL > 0)` raises "argument is of length zero" instead of reaching the
# else branch. Test for NULL first so the failure case prints "False".
if (!is.null(hg_2026) && nrow(hg_2026) > 0) {
  print("True")
} else {
  print("False")
}
## [1] "True"
# Question 8 ----
# Years whose "in film" articles are scraped and combined below (2020-2025)
years <- seq.int(from = 2020L, to = 2025L)
#' Scrape the worldwide-gross table from Wikipedia's "{y} in film" article.
#'
#' @param y Year used to build the article URL
#'   ("https://en.wikipedia.org/wiki/{y}_in_film").
#' @return The first table on the page that has a column name matching
#'   "Worldwide gross" (case-insensitive), with a `year` column set to `y`
#'   added. Signals an error when the page contains no such table.
get_hg <- function(y) {
  url <- paste0("https://en.wikipedia.org/wiki/", y, "_in_film")
  page <- read_html(url)

  # `fill` is deprecated in rvest >= 1.0.0 (missing cells are always filled
  # with NA), so html_table() is called without it.
  tables <- page %>%
    html_elements("table") %>%
    map(html_table)

  hg <- tables %>%
    keep(
      ~ any(str_detect(
        names(.x),
        regex("Worldwide\\s*gross", ignore_case = TRUE)
      ))
    ) %>%
    pluck(1)

  # pluck(1) returns NULL when nothing matched; fail with a message that names
  # the page instead of letting mutate() choke on NULL.
  if (is.null(hg)) {
    stop(
      "No table with a 'Worldwide gross' column found at ", url,
      call. = FALSE
    )
  }

  hg %>% mutate(year = y)
}
# Scrape each year, then stack the per-year tables into one data frame.
# map_dfr() is superseded in purrr 1.0.0; map() followed by bind_rows() is the
# equivalent spelled out, and row-binds the list in the same way.
hg_films <- years %>%
  map(get_hg) %>%
  bind_rows()
hg_films
## # A tibble: 60 × 5
## Rank Title Distributor `Worldwide gross` year
## <int> <chr> <chr> <chr> <int>
## 1 1 Demon Slayer: Kimetsu no Yaiba Mug… Toho / Ani… $507,127,293[4] 2020
## 2 2 The Eight Hundred CMC Pictur… $461,421,559 2020
## 3 3 My People, My Homeland China Lion $433,241,288[5] 2020
## 4 4 Bad Boys for Life Sony $426,505,244 2020
## 5 5 Tenet Warner Bro… $365,309,519 2020
## 6 6 Sonic the Hedgehog Paramount $319,715,683 2020
## 7 7 Dolittle Universal $251,410,631 2020
## 8 8 Jiang Ziya Beijing En… $243,883,429 2020
## 9 9 A Little Red Flower HG Enterta… $238,600,000[6][… 2020
## 10 10 Shock Wave 2 Universe F… $226,400,000 2020
## # ℹ 50 more rows