#install.packages("robotstxt")
#install.packages("rvest")
#install.packages("pacman")
library(pacman)
pacman::p_load(robotstxt,rvest)
Check whether scraping this page is allowed
# Ask robotstxt whether this URL may be fetched before scraping it.
paths_allowed(paths = "https://en.wikipedia.org/wiki/2026_in_film")
## en.wikipedia.org
## [1] TRUE
Scraping the “Highest-grossing films” table
# Parse the 2026 article once, then convert the table found at the CSS path
# below into a data frame. The path is positional (`table:nth-child(15)`), so
# it depends on the page layout at the time of scraping.
all_tables <- read_html("https://en.wikipedia.org/wiki/2026_in_film")
high_gross_table <- html_table(
  html_element(
    all_tables,
    "#mw-content-text > div.mw-content-ltr.mw-parser-output > table:nth-child(15)"
  )
)
Downloading data for 2020–2025
library(rvest)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# 1. Years whose "<year>_in_film" pages will be scraped (integer vector)
years_to_scrape <- seq(2020L, 2025L)
# 2. Scraper for a single year's "<year>_in_film" Wikipedia page
#
# Reads https://en.wikipedia.org/wiki/<selected_year>_in_film, takes the first
# element matching "table.wikitable", converts it to a data frame and adds a
# `Year` column holding `selected_year`.
#
# selected_year: the year to insert into the page URL and the `Year` column.
# Returns: the parsed table with the extra `Year` column.
scrape_year <- function(selected_year) {
  # Build the article address for the requested year
  page_url <- paste0(
    "https://en.wikipedia.org/wiki/", selected_year, "_in_film"
  )

  # Fetch the page and keep only the first matching table node
  page <- read_html(page_url)
  first_table <- html_element(page, "table.wikitable")

  # Convert the node to a data frame and tag every row with its year
  films <- html_table(first_table)
  mutate(films, Year = selected_year)
}
# 3. Execute
# Scrape every year in `years_to_scrape` and stack the per-year tables
# row-wise into a single data frame.
# map() + list_rbind() is used instead of map_df(), which has been
# superseded since purrr 1.0.0.
hg_films <- list_rbind(map(years_to_scrape, scrape_year))
hg_films
## # A tibble: 60 × 5
## Rank Title Distributor `Worldwide gross` Year
## <int> <chr> <chr> <chr> <int>
## 1 1 Demon Slayer: Kimetsu no Yaiba Mug… Toho / Ani… $507,127,293[4] 2020
## 2 2 The Eight Hundred CMC Pictur… $461,421,559 2020
## 3 3 My People, My Homeland China Lion $433,241,288[5] 2020
## 4 4 Bad Boys for Life Sony $426,505,244 2020
## 5 5 Tenet Warner Bro… $365,309,519 2020
## 6 6 Sonic the Hedgehog Paramount $319,715,683 2020
## 7 7 Dolittle Universal $251,410,631 2020
## 8 8 Jiang Ziya Beijing En… $243,883,429 2020
## 9 9 A Little Red Flower HG Enterta… $238,600,000[6][… 2020
## 10 10 Shock Wave 2 Universe F… $226,400,000 2020
## # ℹ 50 more rows