library(rvest)
## Warning: package 'rvest' was built under R version 4.4.2
library(robotstxt)
## Warning: package 'robotstxt' was built under R version 4.4.2
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Question 5
# Full-credits page for IMDb title tt7235466. Defined once so that the
# robots.txt check and the download below always target the same URL.
credits_url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"

# Ask the site's robots.txt whether this path may be scraped. The answer is
# kept (not just printed) so the script can refuse to download when it is
# anything other than TRUE.
scraping_allowed <- paths_allowed(credits_url)
## www.imdb.com
scraping_allowed
## [1] TRUE
if (!isTRUE(all(scraping_allowed))) {
  stop("robots.txt does not permit scraping ", credits_url, call. = FALSE)
}

# Download and parse the page; later steps read from `imbd_html`.
imbd_html <- read_html(credits_url)
imbd_html
## {html_document}
## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body id="styleguide-v2" class="fixed">\n <img height="1" widt ...
# Collect every <table> node found in the parsed credits page.
table_html <- imbd_html |>
  html_elements("table")
table_html
## {xml_nodeset (30)}
## [1] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [2] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [3] <table class="cast_list">\n<tr><td colspan="4" class="castlist_label"></ ...
## [4] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [5] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [6] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [7] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [8] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [9] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [10] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [11] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [12] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [13] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [14] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [15] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [16] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [17] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [18] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [19] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [20] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## ...
Question 6
# Convert the third table on the page into a tibble. Subsetting with `[`
# keeps a node set, so `html_table()` returns a one-element list here.
# NOTE(review): the index 3 is positional; the node-set listing shows that
# table carrying class "cast_list" -- confirm it is still third if the page
# layout changes.
tibble_list <- table_html[3] |>
  html_table()
tibble_list
## [[1]]
## # A tibble: 3,152 × 4
## X1 X2 X3 X4
## <lgl> <chr> <chr> <chr>
## 1 NA "" "" ""
## 2 NA "Angela Bassett" "..." "Athena Grant\n / ... \n …
## 3 NA "" "" ""
## 4 NA "Peter Krause" "..." "Bobby Nash\n 115 episodes, 20…
## 5 NA "" "" ""
## 6 NA "Oliver Stark" "..." "Evan 'Buck' Buckley\n 115 epi…
## 7 NA "" "" ""
## 8 NA "Aisha Hinds" "..." "Henrietta 'Hen' Wilson\n 115 …
## 9 NA "" "" ""
## 10 NA "Kenneth Choi" "..." "Howie 'Chimney' Han\n 115 epi…
## # ℹ 3,142 more rows
# Unwrap the single tibble held in `tibble_list` so it can be used directly.
series_cast <- tibble_list[[1L]]
series_cast
## # A tibble: 3,152 × 4
## X1 X2 X3 X4
## <lgl> <chr> <chr> <chr>
## 1 NA "" "" ""
## 2 NA "Angela Bassett" "..." "Athena Grant\n / ... \n …
## 3 NA "" "" ""
## 4 NA "Peter Krause" "..." "Bobby Nash\n 115 episodes, 20…
## 5 NA "" "" ""
## 6 NA "Oliver Stark" "..." "Evan 'Buck' Buckley\n 115 epi…
## 7 NA "" "" ""
## 8 NA "Aisha Hinds" "..." "Henrietta 'Hen' Wilson\n 115 …
## 9 NA "" "" ""
## 10 NA "Kenneth Choi" "..." "Howie 'Chimney' Han\n 115 epi…
## # ℹ 3,142 more rows
Question 7
# Keep only the 2nd and 4th columns (X2 and X4 in the printed preview, which
# hold the performer name and the character/episode text respectively).
cleaned_tibble <- select(series_cast, 2, 4)

# Drop the spacer rows, i.e. rows where either remaining column is an empty
# string. Each column is compared as a plain vector; the previous version
# compared one-column tibbles, which produced logical matrices as the row
# index. Rows where the comparison is NA are dropped, as they were before.
cleaned_cast <- filter(
  cleaned_tibble,
  if_all(everything(), \(col) col != "")
)

# Show the last few rows to confirm the tail of the cast list survived.
tail(cleaned_cast)
## # A tibble: 6 × 2
## X2 X4
## <chr> <chr>
## 1 Aly Fabrizio "Trick or Treater\n \n \n (uncredited)\n \n …
## 2 Buffy Milner "Volleyball Player\n \n \n (uncredited)\n \n …
## 3 Ithaka Darin Pappas "Migrant\n \n \n (uncredited)\n \n 1…
## 4 Bryce Schmidt "Police Bugler\n \n \n (uncredited)\n \n …
## 5 Timothy T Tyler "Patient\n \n \n (uncredited)\n \n 1…
## 6 Jeffrey Viner "Car\n / ... \n \n \n (uncredited)\n \n …
Question 9
# Pick out a single credits table using a CSS selector on the parsed page.
# NOTE(review): `nth-child(38)` is a positional selector; it presumably points
# at the visual-effects section and will need updating if the page gains or
# loses sections -- confirm against the live page.
visual_effects_html <- imbd_html |>
  html_element("#fullcredits_content > table:nth-child(38)")
visual_effects_html
## {html_node}
## <table class="simpleTable simpleCreditsTable">
## [1] <colgroup>\n<col class="column1">\n<col class="column2">\n<col class="col ...
## [2] <tbody>\n<tr>\n<td class="name">\n<a href="/name/nm3824642/?ref_=ttfc_fc_ ...
# Parse the selected table node into a tibble (a single node yields a single
# tibble rather than a list).
visual_effects <- visual_effects_html |>
  html_table()
visual_effects
## # A tibble: 196 × 3
## X1 X2 X3
## <chr> <chr> <chr>
## 1 Christian Zeiler ... digital compositor / digital compositor: FuseFX…
## 2 Katrina Duclos ... visual effects editor / visual effects editor: …
## 3 Bryant Reif ... cg supervisor (50 episodes, 2019-2022)
## 4 Tony Pirzadeh ... visual effects producer: FuseFX / visual effect…
## 5 Ezra Christian ... managing producer (46 episodes, 2021-2024)
## 6 Timothy Michael Cairns ... compositing supervisor (44 episodes, 2019-2022)
## 7 Luciano DiGeronimo ... compositing supervisor: FuseFX / digital effect…
## 8 Zachary Goodson ... visual effects supervisor / visual effects arti…
## 9 Esmeralda Ramirez ... vfx coordinator: FuseFX / vfx coordinator: Fuse…
## 10 Brigitte Bourque ... digital effects supervisor: Fuse FX / composite…
## # ℹ 186 more rows