Q5
# install.packages("robotstxt")
library(robotstxt)
# Website I would like to scrape
url <- "https://en.wikipedia.org/wiki/2024_in_film"
# Check if scraping is allowed
permissions <- paths_allowed(url)
## en.wikipedia.org
# Print result
if (permissions) {
  cat("Scraping is allowed for:", url, "\n")
} else {
  cat("Scraping is NOT allowed for:", url, "\n")
}
## Scraping is allowed for: https://en.wikipedia.org/wiki/2024_in_film
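# For more than a single yes/no, the parsed robots.txt can be inspected
# directly. A minimal sketch using the same robotstxt package;
# "/w/index.php" is only an illustrative second path:
rtxt <- robotstxt(domain = "en.wikipedia.org")
# Parsed Allow/Disallow rules as a data frame
head(rtxt$permissions)
# paths_allowed() is vectorised, so several paths can be checked at once
paths_allowed(
  paths  = c("/wiki/2024_in_film", "/w/index.php"),
  domain = "en.wikipedia.org"
)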
Q6
# Load rvest
library(rvest)
# Website URL
url <- "https://en.wikipedia.org/wiki/2024_in_film"
# Read the HTML page
doc <- read_html(url)
# Get the first <table> on the page. Note: as the output below shows, this
# is the "List of years in film" navigation sidebar, not the Highest-grossing
# films table (see the sketch at the end of this section)
highest_grossing <- doc %>%
  html_element("table") %>%
  html_table()
# Show first rows
head(highest_grossing)
## # A tibble: 3 × 1
## `List of years in film`
## <chr>
## 1 "… 2014\n2015\n2016\n2017\n2018\n2019\n2020\n2021\n2022\n2023\n2024\n2025\n20…
## 2 "Art\nArchaeology\nArchitecture\n\nLiterature\nMusic\nPhilosophy\nScience+..."
## 3 ".mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.m…
# Confirm we got a structured table (a data frame / tibble) in R --
# structurally valid, though it is the sidebar rather than the film table
cat("Q6 – Obtained structured table in R:", is.data.frame(highest_grossing), "\n")
## Q6 – Obtained structured table in R: TRUE
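# The first <table> was the sidebar, so the Highest-grossing films table
# needs a more specific target. A sketch of two approaches, assuming
# Wikipedia's content tables carry the class "wikitable" and noting that
# table positions can shift as the page is edited (rvest is loaded above):
# Option 1: select the first table with class "wikitable"
wikitable <- doc %>%
  html_element("table.wikitable") %>%
  html_table()
head(wikitable)
# Option 2: pull every table, then inspect and pick the right one by index
all_tables <- doc %>%
  html_elements("table") %>%
  html_table()
length(all_tables)      # how many tables the page contains
head(all_tables[[2]])   # e.g. inspect the second table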