library(rvest)
library(dplyr)
library(purrr)
library(stringr)
library(robotstxt)

# Question 5 ----

# Ask robotstxt whether the 2026 film page may be scraped; the result is
# auto-printed.
paths_allowed(paths = "https://en.wikipedia.org/wiki/2026_in_film")
## Warning: package 'future' was built under R version 4.4.3
##  en.wikipedia.org
## [1] TRUE

# Question 6 ----

# Download and parse the Wikipedia "2026 in film" page.
url_2026 <- "https://en.wikipedia.org/wiki/2026_in_film"
page_2026 <- read_html(url_2026)

# Convert every <table> element on the page into a data frame.
tables_2026 <- map(
  html_elements(page_2026, "table"),
  html_table,
  fill = TRUE
)

# Keep the first table that has a "Worldwide gross" column
# (case-insensitive match on the column names).
hg_2026 <- pluck(
  keep(
    tables_2026,
    function(tbl) {
      any(
        str_detect(
          names(tbl),
          regex("Worldwide\\s*gross", ignore_case = TRUE)
        )
      )
    }
  ),
  1
)

# Check if table was successfully scraped.
# When no table matched, `pluck(1)` leaves `hg_2026` as NULL; `nrow(NULL) > 0`
# is then a zero-length logical and `if` would stop with "argument is of
# length zero" instead of reporting failure, so test for NULL first.
if (!is.null(hg_2026) && nrow(hg_2026) > 0) {
  print("True")
} else {
  print("False")
}
## [1] "True"

# Question 8 ----

# Scrape Highest-Grossing Films from 2020–2025 and combine

# Years to scrape, as an integer vector (one Wikipedia page per year).
years <- seq(from = 2020L, to = 2025L)

#' Scrape the "Worldwide gross" table from Wikipedia's "<year> in film" page
#'
#' Downloads `https://en.wikipedia.org/wiki/<y>_in_film`, converts every
#' `<table>` element on the page to a data frame, and keeps the first one that
#' has a column whose name matches "Worldwide gross" (case-insensitive).
#'
#' @param y A single year (e.g. `2024`) used to build the page URL.
#' @return The matching table as produced by `html_table()`, with an extra
#'   `year` column set to `y`. Cell values are returned as scraped and are not
#'   cleaned here.
#' @details Signals an error if `y` is not length one, if the page cannot be
#'   read, or if no table on the page has a "Worldwide gross" column.
get_hg <- function(y) {
  stopifnot(length(y) == 1L)

  url <- paste0("https://en.wikipedia.org/wiki/", y, "_in_film")
  page <- read_html(url)

  # `fill` is deprecated in rvest >= 1.0.0 (missing cells are always filled
  # with NA), so it is no longer passed.
  tables <- page %>%
    html_elements("table") %>%
    map(html_table)

  # Build the pattern once instead of once per table.
  gross_pattern <- regex("Worldwide\\s*gross", ignore_case = TRUE)

  hg <- tables %>%
    keep(~ any(str_detect(names(.x), gross_pattern))) %>%
    pluck(1)

  # `pluck()` returns NULL when nothing matched; fail with a message that
  # names the page rather than letting `mutate()` choke on NULL.
  if (is.null(hg)) {
    stop(
      "No table with a 'Worldwide gross' column found at ", url,
      call. = FALSE
    )
  }

  hg %>% mutate(year = y)
}

# Scrape each year in turn, then stack the per-year tables row-wise.
hg_films <- years %>%
  map(get_hg) %>%
  bind_rows()

hg_films
## # A tibble: 60 × 5
##     Rank Title                               Distributor `Worldwide gross`  year
##    <int> <chr>                               <chr>       <chr>             <int>
##  1     1 Demon Slayer: Kimetsu no Yaiba Mug… Toho / Ani… $507,127,293[4]    2020
##  2     2 The Eight Hundred                   CMC Pictur… $461,421,559       2020
##  3     3 My People, My Homeland              China Lion  $433,241,288[5]    2020
##  4     4 Bad Boys for Life                   Sony        $426,505,244       2020
##  5     5 Tenet                               Warner Bro… $365,309,519       2020
##  6     6 Sonic the Hedgehog                  Paramount   $319,715,683       2020
##  7     7 Dolittle                            Universal   $251,410,631       2020
##  8     8 Jiang Ziya                          Beijing En… $243,883,429       2020
##  9     9 A Little Red Flower                 HG Enterta… $238,600,000[6][…  2020
## 10    10 Shock Wave 2                        Universe F… $226,400,000       2020
## # ℹ 50 more rows