Scraping table data from a Wikipedia page
library(rvest)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Address of Wikipedia's "2026 in film" article.
url <- "https://en.wikipedia.org/wiki/2026_in_film"
# Download the article and parse its HTML so it can be queried with selectors.
page <- url %>% read_html()
Scraping the “Highest-grossing films” table from the Wikipedia page
# Convert every <table> element on the page into a tibble. The `fill`
# argument is deprecated since rvest 1.0.0 (short rows are always padded
# with NA), so it is no longer passed.
tables <- page %>%
  html_elements("table") %>%
  html_table()
# Count the tables found, to help locate the one of interest by position.
length(tables)
## [1] 7
# Preview the first rows of the third table found on the page.
head(tables[[3]])
## # A tibble: 6 × 4
## Rank Title Distributor `Worldwide gross`
## <int> <chr> <chr> <chr>
## 1 1 Cheburashka 2 † Central Partnership $79,559,272
## 2 2 28 Years Later: The Bone Temple † Sony $56,794,677
## 3 3 Send Help † 20th Century Studios $55,900,522
## 4 4 Border 2 † AA Films $51,204,000[3]
## 5 5 Mercy † Amazon MGM Studios … $49,994,297
## 6 6 Iron Lung † Markiplier Studios $42,434,920
# Keep the third table on the page (the highest-grossing films ranking
# previewed above) under a descriptive name.
highest_grossing_films <- tables[[3L]]
Scraping the highest-grossing films for each year from 2020 to 2025 using a functional approach (purrr) instead of an explicit loop
library(rvest)
library(dplyr)
library(purrr)
library(stringr)
# Scrape the highest-grossing films table from Wikipedia's "<year> in film"
# article.
#
# Args:
#   year: A single year (e.g. 2020) used to build the article URL.
#
# Returns:
#   A tibble holding the selected table plus a `year` column set to the
#   requested year.
#
# Raises:
#   An error if `year` is not a single non-missing value, or if the page
#   contains no `table.wikitable` element.
scrape_year <- function(year) {
  # A vector or NA year would build a malformed URL; fail early instead.
  stopifnot(length(year) == 1L, !is.na(year))

  url <- paste0("https://en.wikipedia.org/wiki/", year, "_in_film")
  page <- read_html(url)

  # `fill` is deprecated since rvest 1.0.0 (rows are always padded with NA).
  tables <- page %>%
    html_elements("table.wikitable") %>%
    html_table()

  if (length(tables) == 0L) {
    stop("No 'table.wikitable' element found at ", url, call. = FALSE)
  }

  # Prefer the table identified by its "Worldwide gross" header so a layout
  # change that inserts another wikitable first does not silently return the
  # wrong data; otherwise fall back to the first wikitable, as before.
  has_gross <- vapply(
    tables,
    function(tbl) "Worldwide gross" %in% names(tbl),
    logical(1)
  )
  hg_table <- if (any(has_gross)) {
    tables[[which(has_gross)[1L]]]
  } else {
    tables[[1L]]
  }

  # Tag every row with the year it was scraped for.
  hg_table %>%
    mutate(year = year)
}
# Scrape each year from 2020 to 2025 and stack the yearly tables row-wise
# into a single tibble.
hg_films_2020_2025 <- 2020:2025 %>%
  map(scrape_year) %>%
  bind_rows()
# Preview the first rows of the combined result.
hg_films_2020_2025 %>% head()
## # A tibble: 6 × 5
## Rank Title Distributor `Worldwide gross` year
## <int> <chr> <chr> <chr> <int>
## 1 1 Demon Slayer: Kimetsu no Yaiba Muge… Toho / Ani… $507,127,293[4] 2020
## 2 2 The Eight Hundred CMC Pictur… $461,421,559 2020
## 3 3 My People, My Homeland China Lion $433,241,288[5] 2020
## 4 4 Bad Boys for Life Sony $426,505,244 2020
## 5 5 Tenet Warner Bro… $365,309,519 2020
## 6 6 Sonic the Hedgehog Paramount $319,715,683 2020