Q5
# install.packages("robotstxt")
library(robotstxt)
# Website I would like to scrape
url <- "https://en.wikipedia.org/wiki/2024_in_film"
# Check if scraping is allowed
permissions <- paths_allowed(url)
## en.wikipedia.org
# Print result
if (permissions) {
  cat("Scraping is allowed for:", url, "\n")
} else {
  cat("Scraping is NOT allowed for:", url, "\n")
}
## Scraping is allowed for: https://en.wikipedia.org/wiki/2024_in_film
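# For more than a single yes/no, the parsed robots.txt can be inspected
# directly. A minimal sketch using the same robotstxt package;
# "/w/index.php" is only an illustrative second path:
rtxt <- robotstxt(domain = "en.wikipedia.org")
# Parsed Allow/Disallow rules as a data frame
head(rtxt$permissions)
# paths_allowed() is vectorised, so several paths can be checked at once
paths_allowed(
  paths  = c("/wiki/2024_in_film", "/w/index.php"),
  domain = "en.wikipedia.org"
)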
Q6
# Load rvest
library(rvest)
# Website URL
url <- "https://en.wikipedia.org/wiki/2024_in_film"
# Read the HTML page
doc <- read_html(url)
# Get the first <table> on the page. Note: as the output below shows, this
# is the "List of years in film" navigation sidebar, not the Highest-grossing
# films table (see the sketch at the end of this section)
highest_grossing <- doc %>%
  html_element("table") %>%
  html_table()
# Show first rows
head(highest_grossing)
## # A tibble: 3 × 1
## `List of years in film`
## <chr>
## 1 "… 2014\n2015\n2016\n2017\n2018\n2019\n2020\n2021\n2022\n2023\n2024\n2025\n20…
## 2 "Art\nArchaeology\nArchitecture\n\nLiterature\nMusic\nPhilosophy\nScience+..."
## 3 ".mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.m…
# Confirm we got a structured table (a data frame / tibble) in R --
# structurally valid, though it is the sidebar rather than the film table
cat("Q6 – Obtained structured table in R:", is.data.frame(highest_grossing), "\n")
## Q6 – Obtained structured table in R: TRUE
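# The first <table> was the sidebar, so the Highest-grossing films table
# needs a more specific target. A sketch of two approaches, assuming
# Wikipedia's content tables carry the class "wikitable" and noting that
# table positions can shift as the page is edited (rvest is loaded above):
# Option 1: select the first table with class "wikitable"
wikitable <- doc %>%
  html_element("table.wikitable") %>%
  html_table()
head(wikitable)
# Option 2: pull every table, then inspect and pick the right one by index
all_tables <- doc %>%
  html_elements("table") %>%
  html_table()
length(all_tables)      # how many tables the page contains
head(all_tables[[2]])   # e.g. inspect the second table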