Setup

library(rvest)
library(dplyr)
library(purrr)
library(robotstxt)
library(stringr)
# Target page: Wikipedia's "2026 in film" article.
url_2026 <- "https://en.wikipedia.org/wiki/2026_in_film"

# Ask the site's robots.txt whether this path may be fetched. The answer is
# printed (as before) and also enforced, so the scraping code further down
# never runs against a page that disallows it.
scraping_allowed <- paths_allowed(url_2026)
print(scraping_allowed)
if (!isTRUE(all(scraping_allowed))) {
  stop("robots.txt does not permit scraping ", url_2026, call. = FALSE)
}
## Warning: package 'future' was built under R version 4.4.3
##  en.wikipedia.org
## [1] TRUE
# Scrape the "Highest-grossing films" table from the 2026 page.
# `url_2026` is already defined in the setup code above, so it is not
# re-assigned here.
page_2026 <- read_html(url_2026)

# Every <table> element on the page, in document order.
tables_2026 <- page_2026 %>%
  html_elements("table")

# One caption string per table; a table without a <caption> yields NA.
captions_2026 <- tables_2026 %>%
  html_element("caption") %>%
  html_text()

# Find the table by its caption. NA captions produce NA from str_detect(),
# which which() silently drops.
hg_index <- which(str_detect(captions_2026, "Highest-grossing films"))

# `[[` needs exactly one index: fail with a readable message when the caption
# is missing, and fall back to the first hit when it appears more than once.
if (length(hg_index) == 0) {
  stop(
    "No table captioned 'Highest-grossing films' found at ", url_2026,
    call. = FALSE
  )
}
if (length(hg_index) > 1) {
  warning(
    "Several tables captioned 'Highest-grossing films' at ", url_2026,
    "; using the first one.",
    call. = FALSE
  )
  hg_index <- hg_index[1]
}

# `fill = TRUE` is dropped: the argument is deprecated since rvest 1.0.0,
# where missing cells are always filled with NA.
hg_2026 <- tables_2026[[hg_index]] %>%
  html_table()

print(hg_2026)
## # A tibble: 10 × 4
##     Rank Title                             Distributor         `Worldwide gross`
##    <int> <chr>                             <chr>               <chr>            
##  1     1 Cheburashka 2 †                   Central Partnership $79,559,272      
##  2     2 28 Years Later: The Bone Temple † Sony                $56,722,595      
##  3     3 Send Help †                       20th Century Studi… $54,977,893      
##  4     4 Border 2 †                        AA Films            $51,204,000[3]   
##  5     5 Mercy                             Amazon MGM Studios… $49,802,465      
##  6     6 Primate †                         Paramount Pictures  $39,702,818      
##  7     7 Iron Lung †                       Markiplier Studios  $37,974,493      
##  8     8 Dracula †                         SND (France)        $33,593,404      
##  9     9 Mana Shankara Vara Prasad Garu †  Gold Box Entertain… $32,225,000[4][5]
## 10    10 Prostokvashino  [ru]†             Cinema Atmosphere   $31,934,076
# Show the first ten rows of the 2026 table.
hg_2026 %>%
  head(n = 10)
years <- 2020:2025

# Scrape the "Highest-grossing films" table from Wikipedia's "<year> in film"
# page and tag every row with the year it was taken from.
#
# @param y A single year; it is pasted into the page URL.
# @return The parsed table with an added `year` column holding `y`.
scrape_highest_grossing <- function(y) {
  url <- paste0("https://en.wikipedia.org/wiki/", y, "_in_film")

  tables <- read_html(url) %>%
    html_elements("table")

  # One caption string per table; a table without a <caption> yields NA.
  captions <- tables %>%
    html_element("caption") %>%
    html_text()

  hg_index <- which(str_detect(captions, "Highest-grossing films"))

  # `[[` needs exactly one index: report which page is the problem instead of
  # failing with a bare subscript error.
  if (length(hg_index) == 0) {
    stop(
      "No table captioned 'Highest-grossing films' found at ", url,
      call. = FALSE
    )
  }
  if (length(hg_index) > 1) {
    warning(
      "Several tables captioned 'Highest-grossing films' at ", url,
      "; using the first one.",
      call. = FALSE
    )
    hg_index <- hg_index[1]
  }

  # `fill = TRUE` is dropped: the argument is deprecated since rvest 1.0.0,
  # where missing cells are always filled with NA.
  tables[[hg_index]] %>%
    html_table() %>%
    mutate(year = y)
}

# One table per year, stacked into a single data frame. map() + bind_rows()
# replaces the superseded map_df() and produces the same row-bound result.
hg_films <- years %>%
  map(scrape_highest_grossing) %>%
  bind_rows()

print(hg_films)
## # A tibble: 60 × 5
##     Rank Title                               Distributor `Worldwide gross`  year
##    <int> <chr>                               <chr>       <chr>             <int>
##  1     1 Demon Slayer: Kimetsu no Yaiba Mug… Toho / Ani… $507,127,293[4]    2020
##  2     2 The Eight Hundred                   CMC Pictur… $461,421,559       2020
##  3     3 My People, My Homeland              China Lion  $433,241,288[5]    2020
##  4     4 Bad Boys for Life                   Sony        $426,505,244       2020
##  5     5 Tenet                               Warner Bro… $365,309,519       2020
##  6     6 Sonic the Hedgehog                  Paramount   $319,715,683       2020
##  7     7 Dolittle                            Universal   $251,410,631       2020
##  8     8 Jiang Ziya                          Beijing En… $243,883,429       2020
##  9     9 A Little Red Flower                 HG Enterta… $238,600,000[6][…  2020
## 10    10 Shock Wave 2                        Universe F… $226,400,000       2020
## # ℹ 50 more rows
# Show the first ten rows of the combined 2020-2025 table.
hg_films %>%
  head(n = 10)

End of Assignment