Install (if needed) and load the packages:
# install.packages("rvest")
library(rvest)
# install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# install.packages("robotstxt")
library(robotstxt)
Question 5: Check if scraping a specific directory is allowed
# Ask robotstxt whether this page's path may be scraped.
paths_allowed(paths = "https://en.wikipedia.org/wiki/2026_in_film")
## en.wikipedia.org
## [1] TRUE
Question 6: Scraping the “Highest-grossing films” table data
# Download the 2026 page and parse its first ".wikitable" element into a tibble.
url <- "https://en.wikipedia.org/wiki/2026_in_film"
films_2026 <- html_table(html_element(read_html(url), ".wikitable"))
print(films_2026)
## # A tibble: 10 × 4
## Rank Title Distributor `Worldwide gross`
## <int> <chr> <chr> <chr>
## 1 1 Cheburashka 2 † Central Partnership $79,559,272
## 2 2 28 Years Later: The Bone Temple † Sony $56,722,595
## 3 3 Send Help † 20th Century Studi… $54,977,893
## 4 4 Border 2 † AA Films $51,204,000[3]
## 5 5 Mercy † Amazon MGM Studios… $49,802,465
## 6 6 Return to Silent Hill † Iconic Events Rele… $41,586,056[4]
## 7 7 Primate † Paramount Pictures $39,702,818
## 8 8 Iron Lung † Markiplier Studios $38,965,988
## 9 9 Dracula † SND (France) $33,593,404
## 10 10 Mana Shankara Vara Prasad Garu † Gold Box Entertain… $32,225,000[5][6]
Question 8(E.C): Write code to scrape the highest-grossing films for
each year from 2020 to 2025 using a loop or functional approach. Combine
all years’ data into a single data frame called hg_films, and include a
variable indicating the year.
# Years whose "<year>_in_film" pages will be scraped (integer vector).
years <- seq.int(2020L, 2025L)
# Scrape the first ".wikitable" table from Wikipedia's "<year>_in_film" page.
#
# year: the year to scrape; it is interpolated into the page URL and also
#       stored in the result's `Year` column.
# Returns the parsed table (as produced by html_table()) with a `Year` column
# added.
#
# NOTE(review): this assumes the first ".wikitable" on every year's page is the
# highest-grossing-films table and that the columns line up across years --
# confirm against the pages being scraped.
scrape_films <- function(year) {
  # Build the URL for *this* year. The previous version built it but then read
  # the global `url` (the 2026 page), so every year returned the same table.
  year_url <- paste0("https://en.wikipedia.org/wiki/", year, "_in_film")

  # Pause between requests so consecutive calls do not hammer the server.
  Sys.sleep(1)

  read_html(year_url) %>%
    html_element(".wikitable") %>%
    html_table() %>%
    mutate(Year = year) # Add the year variable
}
# Scrape each year in `years`, then stack the per-year tables row-wise into a
# single data frame and show a compact summary of it.
hg_films <- years %>%
  map(scrape_films) %>%
  bind_rows()
glimpse(hg_films)
## Rows: 60
## Columns: 5
## $ Rank <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, …
## $ Title <chr> "Cheburashka 2 †", "28 Years Later: The Bone Temple …
## $ Distributor <chr> "Central Partnership", "Sony", "20th Century Studios…
## $ `Worldwide gross` <chr> "$79,559,272", "$56,722,595", "$54,977,893", "$51,20…
## $ Year <int> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020…