#Q4: rvest

# Install robotstxt only when it is missing. Calling install.packages() on a
# package that is already loaded just emits "package is in use and will not be
# installed", and an unconditional install re-downloads on every run.
if (!requireNamespace("robotstxt", quietly = TRUE)) {
  install.packages("robotstxt")
}

library(rvest)
## Warning: package 'rvest' was built under R version 4.5.2
library(robotstxt) # reads robots.txt files and checks scraping permissions
## Warning: package 'robotstxt' was built under R version 4.5.2

#Q5: Scraping

# Install robotstxt only if it is not available yet; re-installing a package
# that is already loaded is skipped by R with a warning.
if (!requireNamespace("robotstxt", quietly = TRUE)) {
  install.packages("robotstxt")
}
library(robotstxt)

# Ask the site's robots.txt whether this page may be scraped
# (TRUE = allowed).
paths_allowed("https://en.wikipedia.org/wiki/2026_in_film")
## Warning: package 'future' was built under R version 4.5.2
##  en.wikipedia.org
## [1] TRUE

#Q6: Highest-grossing films

library(rvest)
Films_page <- "https://en.wikipedia.org/wiki/2026_in_film"

# Parse every <table> element on the page into a list of tibbles.
hg_films <- read_html(Films_page) %>%
  html_table()

# The first table on the page is the "List of years in film" navigation box,
# not the box-office ranking, so locate the wanted table by its column
# headers instead of by position.
# NOTE(review): assumes the ranking table has a header containing "gross"
# (e.g. "Worldwide gross") -- confirm against the live page.
has_gross_col <- vapply(
  hg_films,
  function(tbl) any(grepl("gross", names(tbl), ignore.case = TRUE)),
  logical(1)
)

highest_grossing <- if (any(has_gross_col)) {
  hg_films[[which(has_gross_col)[1]]]
} else {
  # Fall back to the first table so the script still shows something.
  warning(
    "No table with a 'gross' column found on ", Films_page,
    "; showing the first table instead.",
    call. = FALSE
  )
  hg_films[[1]]
}

highest_grossing

#Q7: Ethical and Legal

# Install robotstxt only if it is not available yet; re-installing a package
# that is already loaded is skipped by R with a warning.
if (!requireNamespace("robotstxt", quietly = TRUE)) {
  install.packages("robotstxt")
}
library(robotstxt)

# All URLs must go into the single `paths` argument as one character vector.
# Passed as separate positional arguments, the 2nd, 3rd and 4th URLs are
# bound to `domain`, `bot` and `user_agent`, so only the first URL is checked
# (and against the wrong domain).
# The court-opinion link had a "chrome-extension://.../" PDF-viewer prefix,
# which is not a web address; only the underlying https URL is kept.
legal_urls <- c(
  "https://terms.yelp.com/tos/en_us/20200101_en_us/",
  "https://www.linkedin.com/legal/l/service-terms",
  "https://cdn.ca9.uscourts.gov/datastore/opinions/2019/09/09/17-16783.pdf",
  "https://techcrunch.com/2021/06/14/supreme-court-revives-linkedin-bid-to-protect-user-data-from-web-scrapers/"
)

# With the default domain = "auto" the domain is guessed from each URL;
# the result should hold one TRUE/FALSE per entry of legal_urls.
paths_allowed(paths = legal_urls)