library(robotstxt)
library(rvest)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Question 5 ----
# Check the site's robots.txt rules for the full-credits page before scraping.
paths_allowed(
  paths = "https://imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
)
## imdb.com
## [1] TRUE
# Question 4 & 6 ----
# Download and parse the full-credits page into an HTML document object.
CastCrew_html <-
  "https://imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm" %>%
  read_html()
print(CastCrew_html)
## {html_document}
## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body id="styleguide-v2" class="fixed">\n <img height="1" widt ...
# Collect every <table> element found in the parsed page.
table_html <- CastCrew_html %>%
  html_elements("table")
print(table_html)
## {xml_nodeset (30)}
## [1] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [2] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [3] <table class="cast_list">\n<tr><td colspan="4" class="castlist_label"></ ...
## [4] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [5] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [6] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [7] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [8] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [9] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [10] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [11] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [12] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [13] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [14] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [15] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [16] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [17] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [18] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [19] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## [20] <table class="simpleTable simpleCreditsTable">\n<colgroup>\n<col class=" ...
## ...
# Convert only the third table node to a tibble. Subsetting with `[` keeps a
# node set, so the result is a one-element list of tibbles.
tibble_list <- table_html[3] %>%
  html_table()
print(tibble_list)
## [[1]]
## # A tibble: 3,152 × 4
## X1 X2 X3 X4
## <lgl> <chr> <chr> <chr>
## 1 NA "" "" ""
## 2 NA "Angela Bassett" "..." "Athena Grant\n / ... \n …
## 3 NA "" "" ""
## 4 NA "Peter Krause" "..." "Bobby Nash\n 115 episodes, 20…
## 5 NA "" "" ""
## 6 NA "Oliver Stark" "..." "Evan 'Buck' Buckley\n 115 epi…
## 7 NA "" "" ""
## 8 NA "Aisha Hinds" "..." "Henrietta 'Hen' Wilson\n 115 …
## 9 NA "" "" ""
## 10 NA "Kenneth Choi" "..." "Howie 'Chimney' Han\n 115 epi…
## # ℹ 3,142 more rows
# Question 7 ----
# Pull the single cast tibble out of the one-element list.
clean_tibble <- tibble_list[[1]]

# Keep only the 2nd and 4th columns (X2 and X4); in the parsed table these hold
# the performer name and the character/episode text respectively.
CastCrew_list <- clean_tibble[, c(2, 4)]
CastCrew_list

# Drop spacer rows, i.e. rows where either kept column is an empty string.
# filter() evaluates the conditions on the column vectors themselves, so the
# row mask is a plain logical vector (comparing `CastCrew_list[, 1]` would
# compare a one-column tibble and yield a logical matrix instead).
CastCrew_list <- filter(CastCrew_list, X2 != "", X4 != "")
CastCrew_list
# Question 8 ----
# Rename the two retained columns. The names are applied to CastCrew_list,
# which has exactly two columns. Assigning two names to the four-column
# clean_tibble leaves columns without names and makes tibble (>= 3.0.0) warn
# that `value` must have the same length as `x` and cannot be empty.
# A single names<-() call is enough; colnames<-() would repeat the same rename.
names(CastCrew_list) <- c("x2", "x4")
# Question 9 ----
# Fetch the full-credits page again, this time via the www host.
CastCrew_html <- read_html(
  "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
)

# Select one table by its position among the children of #fullcredits_content
# and parse it into a tibble.
# NOTE(review): presumably the series visual-effects credits, judging by the
# variable name; the positional selector depends on page layout, so confirm.
SeriesVisual_html <- CastCrew_html %>%
  html_element("#fullcredits_content > table:nth-child(38)") %>%
  html_table()
print(SeriesVisual_html)