Installing and Loading Packages

#install.packages("robotstxt")
#install.packages("rvest")
#install.packages("pacman")

library(pacman)
pacman::p_load(robotstxt,rvest)

Question 5

Check whether robots.txt allows scraping this page

# Ask the robotstxt package whether bots are permitted to fetch this page
robotstxt::paths_allowed(
  paths = "https://en.wikipedia.org/wiki/2026_in_film"
)
##  en.wikipedia.org
## [1] TRUE

Question 6

Scraping the “Highest-grossing films” table

# Fetch and parse the 2026 "in film" page. Despite its name, `all_tables`
# holds the whole parsed HTML document, not a list of tables.
all_tables <- read_html("https://en.wikipedia.org/wiki/2026_in_film")

# Select the table by class instead of a DevTools-generated positional path
# (`... > table:nth-child(15)`), which stops matching as soon as an element
# is added or removed above the table. This is the same selector that
# scrape_year() uses for the other year pages.
# NOTE(review): assumes the highest-grossing table is the first
# `table.wikitable` on the page -- confirm against the live page.
high_gross_table <- all_tables %>%
  html_element("table.wikitable") %>%
  html_table()

Question 8

Downloading the data for 2020–2025

library(rvest)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()         masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# 1. Years to scrape, as an inclusive integer range
years_to_scrape <- seq.int(from = 2020L, to = 2025L)

# 2. Scrape the first wikitable from one year's "<year>_in_film" page
#
# Args:
#   selected_year: a single year (e.g. 2020). It is pasted into the page URL
#     and stored in the added `Year` column.
#
# Returns:
#   The table produced by html_table() for the first `table.wikitable` on
#   the page, with an extra `Year` column holding `selected_year`.
#
# Errors if `selected_year` is not a single non-missing value, or if the page
# contains no `table.wikitable`.
scrape_year <- function(selected_year) {
  # paste0() would silently build ".../_in_film" or ".../NA_in_film" from an
  # empty or missing year, so reject those inputs before any request is made
  stopifnot(length(selected_year) == 1L, !is.na(selected_year))

  # Construct URL
  url <- paste0("https://en.wikipedia.org/wiki/", selected_year, "_in_film")

  # Read the page and pick the first table carrying the `wikitable` class
  table_node <- read_html(url) %>%
    html_element("table.wikitable")

  # html_element() yields a missing node when nothing matches; fail here with
  # a message that names the page instead of failing inside html_table()
  if (inherits(table_node, "xml_missing")) {
    stop("No table.wikitable found at ", url, call. = FALSE)
  }

  table_node %>%
    html_table() %>%
    # The argument is named `selected_year` rather than `year` to avoid the
    # 'closure' error -- presumably a clash with a function called `year`
    mutate(Year = selected_year)
}

# 3. Execute: scrape every year, then stack the per-year tables row-wise.
# map() followed by list_rbind() replaces map_df(), which purrr has
# superseded since version 1.0.0.
hg_films <- map(years_to_scrape, scrape_year) %>%
  list_rbind()

hg_films
## # A tibble: 60 × 5
##     Rank Title                               Distributor `Worldwide gross`  Year
##    <int> <chr>                               <chr>       <chr>             <int>
##  1     1 Demon Slayer: Kimetsu no Yaiba Mug… Toho / Ani… $507,127,293[4]    2020
##  2     2 The Eight Hundred                   CMC Pictur… $461,421,559       2020
##  3     3 My People, My Homeland              China Lion  $433,241,288[5]    2020
##  4     4 Bad Boys for Life                   Sony        $426,505,244       2020
##  5     5 Tenet                               Warner Bro… $365,309,519       2020
##  6     6 Sonic the Hedgehog                  Paramount   $319,715,683       2020
##  7     7 Dolittle                            Universal   $251,410,631       2020
##  8     8 Jiang Ziya                          Beijing En… $243,883,429       2020
##  9     9 A Little Red Flower                 HG Enterta… $238,600,000[6][…  2020
## 10    10 Shock Wave 2                        Universe F… $226,400,000       2020
## # ℹ 50 more rows