Install (if needed) and load the packages:

# install.packages("rvest")
library(rvest)

# install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()         masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# install.packages("robotstxt")
library(robotstxt)

Question 5: Check if scraping a specific directory is allowed

# Ask robots.txt whether bots may fetch this Wikipedia page.
paths_allowed(paths = "https://en.wikipedia.org/wiki/2026_in_film")
##  en.wikipedia.org
## [1] TRUE

Question 6: Scraping the “Highest-grossing films” table data

# Address of the Wikipedia page covering films released in 2026.
url <- "https://en.wikipedia.org/wiki/2026_in_film"

# Download the page, pick the first element with class "wikitable",
# and parse that element into a tibble.
films_2026 <- html_table(html_element(read_html(url), ".wikitable"))
print(films_2026)
## # A tibble: 10 × 4
##     Rank Title                             Distributor         `Worldwide gross`
##    <int> <chr>                             <chr>               <chr>            
##  1     1 Cheburashka 2 †                   Central Partnership $79,559,272      
##  2     2 28 Years Later: The Bone Temple † Sony                $56,722,595      
##  3     3 Send Help †                       20th Century Studi… $54,977,893      
##  4     4 Border 2 †                        AA Films            $51,204,000[3]   
##  5     5 Mercy †                           Amazon MGM Studios… $49,802,465      
##  6     6 Return to Silent Hill †           Iconic Events Rele… $41,586,056[4]   
##  7     7 Primate †                         Paramount Pictures  $39,702,818      
##  8     8 Iron Lung †                       Markiplier Studios  $38,965,988      
##  9     9 Dracula †                         SND (France)        $33,593,404      
## 10    10 Mana Shankara Vara Prasad Garu †  Gold Box Entertain… $32,225,000[5][6]

Question 8(E.C): Write code to scrape the highest-grossing films for each year from 2020 to 2025 using a loop or functional approach. Combine all years’ data into a single data frame called hg_films, and include a variable indicating the year.

# Years whose "<year>_in_film" pages will be scraped (integer vector).
years <- seq.int(2020L, 2025L)

# Scrape the first ".wikitable" table from Wikipedia's "<year>_in_film" page.
#
# Args:
#   year: A single year (e.g. 2020) used to build the page URL.
#
# Returns:
#   A tibble holding the parsed table plus a `Year` column set to `year`.
scrape_films <- function(year) {
  # Build the URL for the requested year. The page must be read from this
  # local value: reading the global `url` instead would fetch the 2026 page
  # on every call and merely relabel its rows with `year`.
  year_url <- paste0("https://en.wikipedia.org/wiki/", year, "_in_film")

  # Wait one second before each request so consecutive calls are spaced out.
  Sys.sleep(1)

  read_html(year_url) %>%
    html_element(".wikitable") %>%
    html_table() %>%
    mutate(Year = year) # Add the year variable
}

# Scrape every year in `years` and stack the per-year tibbles row-wise.
hg_films <- bind_rows(map(years, scrape_films))

# Compact column-by-column overview of the combined data.
glimpse(hg_films)
## Rows: 60
## Columns: 5
## $ Rank              <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, …
## $ Title             <chr> "Cheburashka 2 †", "28 Years Later: The Bone Temple …
## $ Distributor       <chr> "Central Partnership", "Sony", "20th Century Studios…
## $ `Worldwide gross` <chr> "$79,559,272", "$56,722,595", "$54,977,893", "$51,20…
## $ Year              <int> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020…