pacman::p_load(robotstxt, rvest)
# Ask the site's robots.txt whether the box-score page for 14 June 2024
# may be fetched; the logical result is printed at top level.
robotstxt::paths_allowed(
  paths = "https://www.basketball-reference.com/boxscores/?month=6&day=14&year=2024"
)
## www.basketball-reference.com
## [1] TRUE
## www.basketball-reference.com
## [1] TRUE
# Download and parse the box-score page; later steps pull tables out of
# this parsed document.
bas_html <- rvest::read_html(
  x = "https://www.basketball-reference.com/boxscores/?month=6&day=14&year=2024"
)
# Show a short summary of the parsed document.
bas_html
## {html_document}
## <html data-version="klecko-" data-root="/home/bbr/deploy/www" lang="en" class="no-js">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="bbr">\n<div id="wrap">\n \n <div id="header" role="banner" ...
# Collect every <table> element found in the parsed box-score page.
table_html <- rvest::html_elements(x = bas_html, css = "table")
# List the matched tables so the ones of interest can be identified.
table_html
## {xml_nodeset (5)}
## [1] <table class="teams"><tbody>\n<tr class="loser">\n<td><a href="/teams/BOS ...
## [2] <table>\n<thead><tr>\n<th></th>\n<!-- the team labels -->\n\t\t\t<th>1</t ...
## [3] <table class="stats"><tbody>\n<tr>\n<td><strong>PTS</strong></td>\n\t\t\t ...
## [4] <table class="suppress_all sortable stats_table" id="confs_standings_E" d ...
## [5] <table class="suppress_all sortable stats_table" id="confs_standings_W" d ...
# Pick the Eastern Conference standings by its table id instead of by
# position: a positional index such as table_html[4] depends on how many
# other tables the page happens to render before it, and would silently
# return a different table if that count changed.
tibble_list <- html_table(html_elements(bas_html, "table#confs_standings_E"))
# Fail loudly if the page layout changed and the table is no longer there.
stopifnot(length(tibble_list) == 1L)
# Extract the content from the list
eastern_tibble <- tibble_list[[1]]
eastern_tibble
## # A tibble: 15 × 7
## `Eastern Conference` W L `W/L%` GB `PS/G` `PA/G`
## <chr> <int> <int> <dbl> <chr> <dbl> <dbl>
## 1 Boston Celtics* 64 18 0.78 — 121. 109.
## 2 New York Knicks* 50 32 0.61 14.0 113. 108.
## 3 Milwaukee Bucks* 49 33 0.598 15.0 119 116.
## 4 Cleveland Cavaliers* 48 34 0.585 16.0 113. 110.
## 5 Orlando Magic* 47 35 0.573 17.0 110. 108.
## 6 Indiana Pacers* 47 35 0.573 17.0 123. 120.
## 7 Philadelphia 76ers* 47 35 0.573 17.0 115. 112.
## 8 Miami Heat* 46 36 0.561 18.0 110. 108.
## 9 Chicago Bulls 39 43 0.476 25.0 112. 114.
## 10 Atlanta Hawks 36 46 0.439 28.0 118. 120.
## 11 Brooklyn Nets 32 50 0.39 32.0 110. 113.
## 12 Toronto Raptors 25 57 0.305 39.0 112. 119.
## 13 Charlotte Hornets 21 61 0.256 43.0 107. 117.
## 14 Washington Wizards 15 67 0.183 49.0 114. 123
## 15 Detroit Pistons 14 68 0.171 50.0 110. 119
# Pick the Western Conference standings by its table id instead of by
# position: a positional index such as table_html[5] depends on how many
# other tables the page happens to render before it, and would silently
# return a different table if that count changed.
tibble_list <- html_table(html_elements(bas_html, "table#confs_standings_W"))
# Fail loudly if the page layout changed and the table is no longer there.
stopifnot(length(tibble_list) == 1L)
# Extract the content from the list
western_tibble <- tibble_list[[1]]
western_tibble
## # A tibble: 15 × 7
## `Western Conference` W L `W/L%` GB `PS/G` `PA/G`
## <chr> <int> <int> <dbl> <chr> <dbl> <dbl>
## 1 Oklahoma City Thunder* 57 25 0.695 — 120. 113.
## 2 Denver Nuggets* 57 25 0.695 — 115. 110.
## 3 Minnesota Timberwolves* 56 26 0.683 1.0 113 106.
## 4 Los Angeles Clippers* 51 31 0.622 6.0 116. 112.
## 5 Dallas Mavericks* 50 32 0.61 7.0 118. 116.
## 6 New Orleans Pelicans* 49 33 0.598 8.0 115. 111.
## 7 Phoenix Suns* 49 33 0.598 8.0 116. 113.
## 8 Los Angeles Lakers* 47 35 0.573 10.0 118 117.
## 9 Sacramento Kings 46 36 0.561 11.0 117. 115.
## 10 Golden State Warriors 46 36 0.561 11.0 118. 115.
## 11 Houston Rockets 41 41 0.5 16.0 114. 113.
## 12 Utah Jazz 31 51 0.378 26.0 116. 120.
## 13 Memphis Grizzlies 27 55 0.329 30.0 106. 113.
## 14 San Antonio Spurs 22 60 0.268 35.0 112. 119.
## 15 Portland Trail Blazers 21 61 0.256 36.0 106. 115.
# Give both conference tables the same descriptive column headers. The
# chained assignment applies one names vector to the western table first
# and then to the eastern table, so the two can never drift apart.
names(eastern_tibble) <- names(western_tibble) <- c(
  "Teams",
  "Wins",
  "Losses",
  "Win/Loss%",
  "Games behind",
  "Pts per game",
  "Opp pts per game"
)
# Show the eastern table with its new headers.
eastern_tibble
## # A tibble: 15 × 7
## Teams Wins Losses `Win/Loss%` `Games behind` `Pts per game`
## <chr> <int> <int> <dbl> <chr> <dbl>
## 1 Boston Celtics* 64 18 0.78 — 121.
## 2 New York Knicks* 50 32 0.61 14.0 113.
## 3 Milwaukee Bucks* 49 33 0.598 15.0 119
## 4 Cleveland Cavaliers* 48 34 0.585 16.0 113.
## 5 Orlando Magic* 47 35 0.573 17.0 110.
## 6 Indiana Pacers* 47 35 0.573 17.0 123.
## 7 Philadelphia 76ers* 47 35 0.573 17.0 115.
## 8 Miami Heat* 46 36 0.561 18.0 110.
## 9 Chicago Bulls 39 43 0.476 25.0 112.
## 10 Atlanta Hawks 36 46 0.439 28.0 118.
## 11 Brooklyn Nets 32 50 0.39 32.0 110.
## 12 Toronto Raptors 25 57 0.305 39.0 112.
## 13 Charlotte Hornets 21 61 0.256 43.0 107.
## 14 Washington Wizards 15 67 0.183 49.0 114.
## 15 Detroit Pistons 14 68 0.171 50.0 110.
## # ℹ 1 more variable: `Opp pts per game` <dbl>
# Download and parse the Wikipedia article on player efficiency rating.
wiki_html <- rvest::read_html(
  x = "https://en.wikipedia.org/wiki/Player_efficiency_rating"
)
# html_element() returns only the first node matching the selector, i.e.
# the first sortable wikitable inside the article body.
per_html <- rvest::html_element(
  x = wiki_html,
  css = "#mw-content-text > div.mw-content-ltr.mw-parser-output > table.wikitable.sortable"
)
# Convert that table node into a tibble and display it.
per <- rvest::html_table(x = per_html)
per
## # A tibble: 25 × 5
## Rank Player Pos `Team(s) played for (years)[a]` PER
## <int> <chr> <chr> <chr> <dbl>
## 1 1 Nikola Jokić^ C Denver Nuggets (2015–present) 28.5
## 2 2 Michael Jordan* SG Chicago Bulls (1984–1993, 1995–1998… 27.9
## 3 3 LeBron James^ SF Cleveland Cavaliers (2003–2010, 201… 27.0
## 4 4 Anthony Davis^ PF/C New Orleans Hornets/Pelicans (2012–… 26.9
## 5 5 Shaquille O'Neal* C Orlando Magic (1992–1996)Los Angele… 26.4
## 6 6 David Robinson* C San Antonio Spurs (1989–2003) 26.2
## 7 7 Wilt Chamberlain* C Philadelphia/San Francisco Warriors… 26.2
## 8 8 Giannis Antetokounmpo^ PF Milwaukee Bucks (2013–present) 25.6
## 9 9 Bob Pettit* PF/C Milwaukee/St. Louis Hawks (1954–196… 25.4
## 10 10 Kevin Durant^ SF/PF Seattle SuperSonics/Oklahoma City T… 24.9
## # ℹ 15 more rows
library(robotstxt)
# Define the URL to check
url <- "https://www.imdb.com/title/tt7235466/fullcredits"
# Check if the path is allowed to be scraped
is_allowed <- paths_allowed(url)
## www.imdb.com
# Print the result. isTRUE() treats anything other than a single TRUE
# (for example NA when the check cannot be completed) as "not allowed"
# instead of stopping with an error inside if().
if (isTRUE(is_allowed)) {
  print("Scraping is allowed for this page.")
} else {
  print("Scraping is NOT allowed for this page.")
}
## [1] "Scraping is allowed for this page."
library(rvest)
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
webpage <- read_html(url)
# Get all tables on the page (html_elements() is the current name for the
# superseded html_nodes()).
tables <- html_elements(webpage, "table")
# Parse the 3rd table on the page (the Series Cast table) so that there is
# an actual data frame to inspect; without this assignment
# series_cast_table is never created by this script.
series_cast_table <- html_table(tables[[3]])
# View the number of rows and columns.
# Re-knit to refresh the recorded output that follows this call.
dim(series_cast_table)
## NULL
library(rvest)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# URL of the IMDb full cast page
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
# Read the HTML content
webpage <- read_html(url)
# Collect the tables from the page that was just read, so the cast table
# below comes from this download rather than from an earlier one.
tables <- html_elements(webpage, "table")
# Select the 3rd table (Series Cast). The `fill` argument is deprecated in
# current rvest (missing cells are always filled with NA), so it is omitted.
cast_table <- html_table(tables[[3]])
# Clean the data. One filter pass is enough: it already drops every row
# whose name (X2) or role (X4) is NA or an empty string.
clean_cast <- cast_table %>%
  select(2, 4) %>% # Keep only the 2nd and 4th columns
  filter(!is.na(X2) & X2 != "", !is.na(X4) & X4 != "") # Remove blank rows
# Final count
num_rows <- nrow(clean_cast)
num_columns <- ncol(clean_cast)
cat("Final cleaned dataset contains:", num_rows, "observations and", num_columns, "columns.\n")
## Final cleaned dataset contains: 1575 observations and 2 columns.
# Cleaned dataset
head(clean_cast)
## # A tibble: 6 × 2
## X2 X4
## <chr> <chr>
## 1 Angela Bassett "Athena Grant\n / ... \n 115 e…
## 2 Peter Krause "Bobby Nash\n 115 episodes, 2018-2025"
## 3 Oliver Stark "Evan 'Buck' Buckley\n 115 episodes, 20…
## 4 Aisha Hinds "Henrietta 'Hen' Wilson\n 115 episodes,…
## 5 Kenneth Choi "Howie 'Chimney' Han\n 115 episodes, 20…
## 6 Jennifer Love Hewitt "Maddie Kendall\n / ... \n 105…
# Grab the single table that is the 38th child of the #fullcredits_content
# container. NOTE(review): this positional selector is tied to the page's
# current layout; confirm it still points at the intended credits table.
raw_visual <- rvest::html_element(
  x = webpage,
  css = "#fullcredits_content > table:nth-child(38)"
)
# head() on a single node shows the node's internal fields (the $node and
# $doc pointers) rather than its HTML content.
utils::head(x = raw_visual)
## $node
## <pointer: 0x1120de980>
##
## $doc
## <pointer: 0x10dd04720>
# Convert the selected table node into a tibble; the columns are
# auto-named X1, X2, X3 in the result.
parse_visual <- rvest::html_table(x = raw_visual)
# Preview the first six rows.
utils::head(x = parse_visual)
## # A tibble: 6 × 3
## X1 X2 X3
## <chr> <chr> <chr>
## 1 Christian Zeiler ... digital compositor / digital compositor: FuseFX …
## 2 Katrina Duclos ... visual effects editor / visual effects editor: F…
## 3 Bryant Reif ... cg supervisor (50 episodes, 2019-2022)
## 4 Tony Pirzadeh ... visual effects producer: FuseFX / visual effects…
## 5 Ezra Christian ... managing producer (46 episodes, 2021-2024)
## 6 Timothy Michael Cairns ... compositing supervisor (44 episodes, 2019-2022)