library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(rvest)
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
# Address of the IMDb full-credits page for title tt1490017.
url <- "https://www.imdb.com/title/tt1490017/fullcredits?ref_=tt_cl_sm"
# Read the page and parse it into an HTML document.
html <- read_html(url)
# Collect every node matching ".character" inside ".cast_list".
characters <- html %>%
  html_nodes(".cast_list .character")
# Number of nodes matched.
length(characters)
## [1] 35
# Show the first two matched nodes.
characters[1:2]
## {xml_nodeset (2)}
## [1] <td class="character">\n <a href="/title/tt1490017/characters/ ...
## [2] <td class="character">\n <a href="/title/tt1490017/characters/ ...
To inspect properties of the currently selected nodes, such as their tag name:
# Tag name of every node matching ".cast_list".
html %>%
  html_nodes(".cast_list") %>%
  html_name()
## [1] "table"
To choose only the first match, use `html_node()` instead of `html_nodes()`:
# Tag name of the first node matching ".cast_list".
html %>%
  html_node(".cast_list") %>%
  html_name()
## [1] "table"
# Print the first node matching ".cast_list" itself.
html_node(html, ".cast_list")
## {html_node}
## <table class="cast_list">
## [1] <tr><td colspan="4" class="castlist_label"></td></tr>\n
## [2] <tr class="odd">\n<td class="primary_photo">\n<a href="/name/nm0004715/? ...
## [3] <tr class="even">\n<td class="primary_photo">\n<a href="/name/nm0006969/ ...
## [4] <tr class="odd">\n<td class="primary_photo">\n<a href="/name/nm1911947/? ...
## [5] <tr class="even">\n<td class="primary_photo">\n<a href="/name/nm1555340/ ...
## [6] <tr class="odd">\n<td class="primary_photo">\n<a href="/name/nm0123262/? ...
## [7] <tr class="even">\n<td class="primary_photo">\n<a href="/name/nm0000355/ ...
## [8] <tr class="odd">\n<td class="primary_photo">\n<a href="/name/nm0206359/? ...
## [9] <tr class="even">\n<td class="primary_photo">\n<a href="/name/nm5322950/ ...
## [10] <tr class="odd">\n<td class="primary_photo">\n<a href="/name/nm1584992/? ...
## [11] <tr class="even">\n<td class="primary_photo">\n<a href="/name/nm0002071/ ...
## [12] <tr class="odd">\n<td class="primary_photo">\n<a href="/name/nm0287182/? ...
## [13] <tr class="even">\n<td class="primary_photo">\n<a href="/name/nm2002649/ ...
## [14] <tr class="odd">\n<td class="primary_photo">\n<a href="/name/nm0000151/? ...
## [15] <tr class="even">\n<td class="primary_photo">\n<a href="/name/nm3025399/ ...
## [16] <tr class="odd">\n<td class="primary_photo">\n<a href="/name/nm1706767/? ...
## [17] <tr class="even">\n<td class="primary_photo">\n<a href="/name/nm2159926/ ...
## [18] <tr class="odd">\n<td class="primary_photo">\n<a href="/name/nm1221047/? ...
## [19] <tr class="even">\n<td class="primary_photo">\n<a href="/name/nm2185982/ ...
## [20] <tr class="odd">\n<td class="primary_photo">\n<a href="/name/nm0003021/? ...
## ...
If the name of the current tag is `table`, it can usually be parsed into a data frame automatically:
# Parse the first ".cast_list" node as a table and show its first rows.
html %>%
  html_node(".cast_list") %>%
  html_table() %>%
  head()
## # A tibble: 6 x 4
## X1 X2 X3 X4
## <lgl> <chr> <chr> <chr>
## 1 NA "" "" ""
## 2 NA "Will Arnett" "..." "Batman / \n Bruce Wayne \n \n \n (~
## 3 NA "Elizabeth Ba~ "..." "Wyldstyle / \n Lucy \n \n \n (voic~
## 4 NA "Craig Berry" "..." "Blake / \n Additional Voices \n \n ~
## 5 NA "Alison Brie" "..." "Unikitty \n \n \n (voice)"
## 6 NA "David Burrow~ "..." "Octan Robot / \n Additional Voices \n~
To extract the text of the first character node, use `html_node()`. To extract the text of all character nodes, use `html_nodes()`.
# Text of the first character cell, with whitespace squished by str_squish().
html %>%
  html_node(".cast_list .character") %>%
  html_text() %>%
  str_squish()
## [1] "Batman / Bruce Wayne (voice)"
Fetch the URLs of the actors' pages. Attributes of HTML tags can be extracted using `html_attrs()` (or `html_attr()` for a single attribute):
# "href" attribute of each child element of the character cells.
html %>%
  html_nodes(".cast_list .character") %>%
  html_children() %>%
  html_attr("href")
## [1] "/title/tt1490017/characters/nm0004715?ref_=ttfc_fc_cl_t1"
## [2] "/title/tt1490017/characters/nm0004715?ref_=ttfc_fc_cl_t1"
## [3] "/title/tt1490017/characters/nm0006969?ref_=ttfc_fc_cl_t2"
## [4] "/title/tt1490017/characters/nm0006969?ref_=ttfc_fc_cl_t2"
## [5] "/title/tt1490017/characters/nm1911947?ref_=ttfc_fc_cl_t3"
## [6] "/title/tt1490017/characters/nm1911947?ref_=ttfc_fc_cl_t3"
## [7] "/title/tt1490017/characters/nm1555340?ref_=ttfc_fc_cl_t4"
## [8] "/title/tt1490017/characters/nm0123262?ref_=ttfc_fc_cl_t5"
## [9] "/title/tt1490017/characters/nm0123262?ref_=ttfc_fc_cl_t5"
## [10] "/title/tt1490017/characters/nm0000355?ref_=ttfc_fc_cl_t6"
## [11] "/title/tt1490017/characters/nm0206359?ref_=ttfc_fc_cl_t7"
## [12] "/title/tt1490017/characters/nm5322950?ref_=ttfc_fc_cl_t8"
## [13] "/title/tt1490017/characters/nm1584992?ref_=ttfc_fc_cl_t9"
## [14] "/title/tt1490017/characters/nm0002071?ref_=ttfc_fc_cl_t10"
## [15] "/title/tt1490017/characters/nm0002071?ref_=ttfc_fc_cl_t10"
## [16] "/title/tt1490017/characters/nm0002071?ref_=ttfc_fc_cl_t10"
## [17] "/title/tt1490017/characters/nm0287182?ref_=ttfc_fc_cl_t11"
## [18] "/title/tt1490017/characters/nm2002649?ref_=ttfc_fc_cl_t12"
## [19] "/title/tt1490017/characters/nm0000151?ref_=ttfc_fc_cl_t13"
## [20] "/title/tt1490017/characters/nm3025399?ref_=ttfc_fc_cl_t14"
## [21] "/title/tt1490017/characters/nm3025399?ref_=ttfc_fc_cl_t14"
## [22] "/title/tt1490017/characters/nm1706767?ref_=ttfc_fc_cl_t15"
## [23] "/title/tt1490017/characters/nm2159926?ref_=ttfc_fc_cl_t16"
## [24] "/title/tt1490017/characters/nm1221047?ref_=ttfc_fc_cl_t17"
## [25] "/title/tt1490017/characters/nm2185982?ref_=ttfc_fc_cl_t18"
## [26] "/title/tt1490017/characters/nm0003021?ref_=ttfc_fc_cl_t19"
## [27] "/title/tt1490017/characters/nm0003021?ref_=ttfc_fc_cl_t19"
## [28] "/title/tt1490017/characters/nm0588087?ref_=ttfc_fc_cl_t20"
## [29] "/title/tt1490017/characters/nm6352827?ref_=ttfc_fc_cl_t21"
## [30] "/title/tt1490017/characters/nm0000553?ref_=ttfc_fc_cl_t22"
## [31] "/title/tt1490017/characters/nm0000553?ref_=ttfc_fc_cl_t22"
## [32] "/title/tt1490017/characters/nm0000553?ref_=ttfc_fc_cl_t22"
## [33] "/title/tt1490017/characters/nm2511956?ref_=ttfc_fc_cl_t23"
## [34] "/title/tt1490017/characters/nm2511956?ref_=ttfc_fc_cl_t23"
## [35] "/title/tt1490017/characters/nm0641944?ref_=ttfc_fc_cl_t24"
## [36] "/title/tt1490017/characters/nm0644406?ref_=ttfc_fc_cl_t25"
## [37] "/title/tt1490017/characters/nm3658001?ref_=ttfc_fc_cl_t26"
## [38] "/title/tt1490017/characters/nm0695435?ref_=ttfc_fc_cl_t27"
## [39] "/title/tt1490017/characters/nm2361892?ref_=ttfc_fc_cl_t28"
## [40] "/title/tt1490017/characters/nm3338018?ref_=ttfc_fc_cl_t29"
## [41] "/title/tt1490017/characters/nm1130627?ref_=ttfc_fc_cl_t30"
## [42] "/title/tt1490017/characters/nm3596979?ref_=ttfc_fc_cl_t31"
## [43] "/title/tt1490017/characters/nm3596979?ref_=ttfc_fc_cl_t31"
## [44] "/title/tt1490017/characters/nm1672246?ref_=ttfc_fc_cl_t32"
## [45] "/title/tt1490017/characters/nm1672246?ref_=ttfc_fc_cl_t32"
## [46] "/title/tt1490017/characters/nm1475594?ref_=ttfc_fc_cl_t33"
## [47] "/title/tt1490017/characters/nm0001850?ref_=ttfc_fc_cl_t34"
## [48] "/title/tt1490017/characters/nm4341923?ref_=ttfc_fc_cl_t35"
Return text from child nodes
# Text of each child element of the first character cell.
html %>%
  html_node(".cast_list .character") %>%
  html_children() %>%
  html_text()
## [1] "Batman" "Bruce Wayne"
Use XPath to extract only the text that belongs directly to the parent node (not to its children):
# Direct text nodes of the first character cell, selected via XPath.
html %>%
  html_node(".cast_list .character") %>%
  html_nodes(xpath = "./text()[normalize-space()]")
## {xml_nodeset (2)}
## [1] / \n
## [2] \n \n \n (voice)\n \n \n
Simplify the output and return the values as a plain character vector:
# Same direct text nodes, converted to trimmed strings.
html %>%
  html_node(".cast_list .character") %>%
  html_nodes(xpath = "./text()[normalize-space()]") %>%
  html_text(trim = TRUE)
## [1] "/" "(voice)"