library(robotstxt)
library(rvest)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(purrr)

Question 5

Robots.txt: is scraping the Wikipedia page allowed?

library(robotstxt)

# Page whose crawl permissions are checked against the site's robots.txt.
url <- "https://en.wikipedia.org/wiki/2026_in_film"

# Returns TRUE when the site's robots.txt permits bots to fetch this path.
paths_allowed(paths = url)
##  en.wikipedia.org
## [1] TRUE

Question 6

Please try scraping the data from the “Highest-grossing films” table.

Mark this as True if you could obtain the structured table in R.
library(rvest)

url <- "https://en.wikipedia.org/wiki/2026_in_film"

# Download and parse the article's HTML.
page <- read_html(url)

# Convert every <table> element on the page into a tibble.
# `fill` is deprecated since rvest 1.0.0 (short rows are always padded with
# NA), so it is no longer passed.
tables <- page |>
  html_elements("table") |>
  html_table()

# Flag the tables that have a column whose name mentions "gross".
# vapply() always yields a logical vector, even when `tables` is empty,
# whereas sapply() would return an empty list in that case.
has_gross_col <- vapply(
  tables,
  function(x) any(grepl("gross", names(x), ignore.case = TRUE)),
  logical(1)
)

# Fail with a descriptive message rather than "subscript out of bounds"
# when the page layout changes and no table matches.
if (!any(has_gross_col)) {
  stop(
    "No table with a \"gross\" column was found at ", url,
    call. = FALSE
  )
}

# Pick the first table that has "gross" in its column names
hg <- tables[[which(has_gross_col)[1]]]

# Print the "Highest-grossing films" table
hg
## # A tibble: 10 × 4
##     Rank Title                             Distributor         `Worldwide gross`
##    <int> <chr>                             <chr>               <chr>            
##  1     1 Cheburashka 2 †                   Central Partnership $79,559,272      
##  2     2 28 Years Later: The Bone Temple † Sony                $56,794,677      
##  3     3 Send Help †                       20th Century Studi… $55,900,522      
##  4     4 Border 2 †                        AA Films            $51,204,000[3]   
##  5     5 Mercy †                           Amazon MGM Studios… $49,994,297      
##  6     6 Iron Lung †                       Markiplier Studios  $42,434,920      
##  7     7 Return to Silent Hill †           Iconic Events Rele… $41,586,056[4]   
##  8     8 Primate †                         Paramount Pictures  $39,722,402      
##  9     9 Dracula †                         SND (France)        $33,618,925      
## 10    10 Mana Shankara Vara Prasad Garu †  Gold Box Entertain… $32,225,000[5][6]