library(robotstxt)
library(rvest)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
Question 5
Robots.txt: is scraping the Wikipedia page allowed?
# Consult the site's robots.txt before scraping anything from it.
library(robotstxt)

# Article whose crawl permission is being checked.
url <- "https://en.wikipedia.org/wiki/2026_in_film"

# Auto-prints the permission check result for this path.
paths_allowed(paths = url)
## en.wikipedia.org
## [1] TRUE
Question 6
Please try scraping the “Highest-grossing films” table data.
Mark this as True if you could obtain the structured table in
R.
library(rvest)

url <- "https://en.wikipedia.org/wiki/2026_in_film"

# Download and parse the article's HTML.
page <- read_html(url)

# Convert every <table> element on the page into a tibble.
# `fill` is deprecated since rvest 1.0.0 (short rows are always padded with
# NA), so it is no longer passed.
tables <- page |>
  html_elements("table") |>
  html_table()

# Flag the tables that have a column whose name mentions "gross".
# vapply() always yields a logical vector, even when the page has no tables;
# sapply() would return an empty list there and break the subsetting below.
has_gross <- vapply(
  tables,
  function(x) any(grepl("gross", names(x), ignore.case = TRUE)),
  logical(1)
)

# Fail with a clear message instead of "subscript out of bounds" if the page
# layout changes and no matching table exists.
if (!any(has_gross)) {
  stop("No table with a 'gross' column found at ", url, call. = FALSE)
}

# Keep the first match (the "Highest-grossing films" table on this page).
hg <- tables[[which(has_gross)[1]]]

# Print the "Highest-grossing films" table
hg
## # A tibble: 10 × 4
## Rank Title Distributor `Worldwide gross`
## <int> <chr> <chr> <chr>
## 1 1 Cheburashka 2 † Central Partnership $79,559,272
## 2 2 28 Years Later: The Bone Temple † Sony $56,794,677
## 3 3 Send Help † 20th Century Studi… $55,900,522
## 4 4 Border 2 † AA Films $51,204,000[3]
## 5 5 Mercy † Amazon MGM Studios… $49,994,297
## 6 6 Iron Lung † Markiplier Studios $42,434,920
## 7 7 Return to Silent Hill † Iconic Events Rele… $41,586,056[4]
## 8 8 Primate † Paramount Pictures $39,722,402
## 9 9 Dracula † SND (France) $33,618,925
## 10 10 Mana Shankara Vara Prasad Garu † Gold Box Entertain… $32,225,000[5][6]