# install.packages("pacman")

pacman::p_load(robotstxt, rvest)

# Pre-scraping check

# Ask the site's robots.txt whether this path may be fetched before scraping.
# The call is namespace-qualified, so it does not depend on robotstxt being
# attached.
robotstxt::paths_allowed(
  paths = "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
)

#>  www.imdb.com
#> [1] TRUE

# Q6

library(rvest)

# Full cast & crew page of the title being scraped.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"

# Download and parse the page, then turn every <table> on it into a tibble.
page <- read_html(url)
tables <- html_table(page)

# Number of tables found on the page.
length(tables)

# The third table is used as the series cast table (selected by position).
series_cast_table <- tables[[3]]

# Report the dimensions of that table.
rows <- dim(series_cast_table)[1L]
columns <- dim(series_cast_table)[2L]
cat("The table has", rows, "rows and", columns, "columns.\n")

# Preview the first rows.
print(head(series_cast_table))

# Q7

library(rvest)
library(dplyr)

# Full cast & crew page of the title being scraped.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"

page <- read_html(url)

# Turn every <table> on the page into a tibble; the third one is used as the
# series cast table (selected by position).
tables <- page %>% html_table()
series_cast_table <- tables[[3]]

# Keep only the 2nd and 4th columns (presumably actor and character --
# confirm against the printed preview below).
cleaned_table <- series_cast_table %>% select(2, 4)

# Keep the rows in which both columns hold a non-empty value.
# - `[[` extracts plain vectors, so the row index is an ordinary logical
#   vector rather than a one-column logical matrix built from tibbles.
# - NA cells are excluded explicitly via is.na().
# - A row that is empty in every column necessarily fails this condition, so
#   no separate "drop all-empty rows" pass is required.
cleaned_table <- cleaned_table[
  !is.na(cleaned_table[[1]]) & cleaned_table[[1]] != "" &
    !is.na(cleaned_table[[2]]) & cleaned_table[[2]] != "",
]

# Report the dimensions of the cleaned table.
final_rows <- nrow(cleaned_table)
final_columns <- ncol(cleaned_table)

cat("The cleaned cast table has", final_rows, "observations and", final_columns, "columns. \n")

print(head(cleaned_table))

# Q9

library(rvest)
library(dplyr)

# Full cast & crew page of the title being scraped.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"

page <- read_html(url)

# Select the first <table> that follows an <h4> heading whose text contains
# "Series Visual Effects", and parse it. html_elements() is used instead of
# the superseded html_nodes(); html_table() on a node set returns a list of
# tibbles, one per matched table.
visual_effects_table <- page %>%
  html_elements(xpath = "//h4[contains(text(), 'Series Visual Effects')]/following-sibling::table[1]") %>%
  html_table()

# Stop with an explicit message when nothing matched, instead of letting
# `[[1]]` fail with "subscript out of bounds".
if (length(visual_effects_table) == 0) {
  stop("No 'Series Visual Effects' table found at ", url, call. = FALSE)
}

visual_effects_table <- visual_effects_table[[1]]

# Drop rows whose first column is missing or empty.
visual_effects_table <- visual_effects_table[!is.na(visual_effects_table[[1]]) & visual_effects_table[[1]] != "", ]

# Each remaining row is counted as one staff member.
num_staff <- nrow(visual_effects_table)

cat("The number of staff who worked on Series Visual Effects is", num_staff, ".\n")

print(head(visual_effects_table))

# Step 1: Read the HTML webpage

# Download the IMDb full-credits page and parse it into an HTML document.
bas_html <- read_html(
  x = "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
)
# Show a summary of the parsed document.
print(bas_html)
#> {html_document}
#> <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
#> [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n<script type="text/jav ...
#> [2] <body id="styleguide-v2" class="fixed">\n            <img height="1" width="1" style="display:none; ...

# Step 2: Select HTML elements

# Collect every <table> element of the parsed page into a node set.
tables <- bas_html %>% html_elements(css = "table")
# Show the matched nodes.
print(tables)

# Step 3: Parse the HTML table into a tibble

# Parse only the third <table> node. `tables[3]` is still a node set, so
# html_table() returns a list; the single tibble is extracted from it next.
tibble_list <- tables[3] %>% html_table()
cast_tibble <- tibble_list[[1L]]
# Show the resulting cast table.
print(cast_tibble)
# Download and parse the full-credits page again.
# NOTE(review): the object is named `wiki_html` although the URL points at
# IMDb; the name is kept unchanged.
wiki_html <- read_html(
  x = "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
)

# Pick the cast table by CSS selector: the `table.cast_list` element that is
# a direct child of `#fullcredits_content`.
per_html <- wiki_html %>%
  html_element(css = "#fullcredits_content > table.cast_list")

# Convert that single element into a tibble and show it.
per <- per_html %>% html_table()
print(per)
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCmBgYHtyfQojIGluc3RhbGwucGFja2FnZXMoInBhY21hbiIpCgpwYWNtYW46OnBfbG9hZChyb2JvdHN0eHQsIHJ2ZXN0KQpgYGAKCiMgUHJlLXNjcmFwaW5nIGNoZWNrCmBgYHtyfQojIGxpYnJhcnkocm9ib3RzdHh0KQpwYXRoc19hbGxvd2VkKCJodHRwczovL3d3dy5pbWRiLmNvbS90aXRsZS90dDcyMzU0NjYvZnVsbGNyZWRpdHM/cmVmXz10dF9jbF9zbSIpCmBgYAoKUTYKYGBge3J9CmxpYnJhcnkocnZlc3QpCgp1cmwgPC0gImh0dHBzOi8vd3d3LmltZGIuY29tL3RpdGxlL3R0NzIzNTQ2Ni9mdWxsY3JlZGl0cz9yZWZfPXR0X2NsX3NtIgoKcGFnZSA8LSByZWFkX2h0bWwodXJsKQoKdGFibGVzIDwtIHBhZ2UgJT4lIGh0bWxfdGFibGUoKQoKbGVuZ3RoKHRhYmxlcykKCnNlcmllc19jYXN0X3RhYmxlIDwtIHRhYmxlc1tbM11dCgpyb3dzIDwtIG5yb3coc2VyaWVzX2Nhc3RfdGFibGUpCmNvbHVtbnMgPC0gbmNvbChzZXJpZXNfY2FzdF90YWJsZSkKCmNhdCgnVGhlIHRhYmxlIGhhcycsIHJvd3MsICdyb3dzIGFuZCcsIGNvbHVtbnMsICdjb2x1bW5zLlxuJykKCnByaW50KGhlYWQoc2VyaWVzX2Nhc3RfdGFibGUpKQpgYGAKCgpRNwpgYGB7cn0KbGlicmFyeShydmVzdCkKbGlicmFyeShkcGx5cikKCnVybCA8LSAiaHR0cHM6Ly93d3cuaW1kYi5jb20vdGl0bGUvdHQ3MjM1NDY2L2Z1bGxjcmVkaXRzP3JlZl89dHRfY2xfc20iCgpwYWdlIDwtIHJlYWRfaHRtbCh1cmwpCgp0YWJsZXMgPC0gcGFnZSAlPiUgaHRtbF90YWJsZSgpCgpzZXJpZXNfY2FzdF90YWJsZSA8LSB0YWJsZXNbWzNdXQoKY2xlYW5lZF90YWJsZSA8LSBzZXJpZXNfY2FzdF90YWJsZSAlPiUgc2VsZWN0KDIsIDQpCgpjbGVhbmVkX3RhYmxlIDwtIHN1YnNldChjbGVhbmVkX3RhYmxlLCBjbGVhbmVkX3RhYmxlWywgMV0gIT0gIiIgJiBjbGVhbmVkX3RhYmxlWywgMl0gIT0gIiIpCgoKY2xlYW5lZF90YWJsZSA8LSBjbGVhbmVkX3RhYmxlWyFhcHBseShjbGVhbmVkX3RhYmxlID09ICIiLCAxLCBhbGwpLCBdCgoKZmluYWxfcm93cyA8LSBucm93KGNsZWFuZWRfdGFibGUpCmZpbmFsX2NvbHVtbnMgPC0gbmNvbChjbGVhbmVkX3RhYmxlKQoKY2F0KCJUaGUgY2xlYW5lZCBjYXN0IHRhYmxlIGhhcyIsIGZpbmFsX3Jvd3MsICJvYmVyc2VydmF0aW9ucyBhbmQiLCBmaW5hbF9jb2x1bW5zLCAiY29sdW1ucy4gXG4iKQoKcHJpbnQoaGVhZChjbGVhbmVkX3RhYmxlKSkKYGBgCgpROQpgYGB7cn0KbGlicmFyeShydmVzdCkKbGlicmFyeShkcGx5cikKCnVybCA8LSAiaHR0cHM6Ly93d3cuaW1kYi5jb20vdGl0bGUvdHQ3MjM1NDY2L2Z1bGxjcmVkaXRzP3JlZl89dHRfY2xfc20iCgpwYWdlIDwtIHJlYWRfaHRtbCh1cmwpCgoKdmlzdWFsX2VmZmVjdHNfdGFibGUgPC0gcGFnZSAlPiUgCiAgaHRtbF9ub2Rlcyh4cGF0aCA9ICIvL2g0W2NvbnRhaW5zKHRleHQoKSwgJ1NlcmllcyBWaXN1YWwgRWZmZWN0
cycpXS9mb2xsb3dpbmctc2libGluZzo6dGFibGVbMV0iKSAlPiUgCiAgaHRtbF90YWJsZSgpCgp2aXN1YWxfZWZmZWN0c190YWJsZSA8LSB2aXN1YWxfZWZmZWN0c190YWJsZVtbMV1dCgp2aXN1YWxfZWZmZWN0c190YWJsZSA8LSB2aXN1YWxfZWZmZWN0c190YWJsZVshaXMubmEodmlzdWFsX2VmZmVjdHNfdGFibGVbWzFdXSkgJiB2aXN1YWxfZWZmZWN0c190YWJsZVtbMV1dICE9ICIiLCBdCgpudW1fc3RhZmYgPC0gbnJvdyh2aXN1YWxfZWZmZWN0c190YWJsZSkKCmNhdCgiVGhlIG51bWJlciBvZiBzdGFmZiB3aG8gd29ya2VkIG9uIFNlcmllcyBWaXN1YWwgRWZmZWN0cyBpcyIsIG51bV9zdGFmZiwgIi5cbiIpCgpwcmludChoZWFkKHZpc3VhbF9lZmZlY3RzX3RhYmxlKSkKYGBgCgojIFN0ZXAgMTogUmVhZCB0aGUgSFRNTCBXZWJwYWdlCmBgYHtyfQpiYXNfaHRtbCA8LSByZWFkX2h0bWwoImh0dHBzOi8vd3d3LmltZGIuY29tL3RpdGxlL3R0NzIzNTQ2Ni9mdWxsY3JlZGl0cz9yZWZfPXR0X2NsX3NtIikKYmFzX2h0bWwKYGBgCgojIFN0ZXAgMjogU2VsZWN0IEhUTUwgZWxlbWVudHMKYGBge3J9CnRhYmxlcyA8LSBodG1sX2VsZW1lbnRzKGJhc19odG1sLCAidGFibGUiKQp0YWJsZXMKYGBgCgojIFN0ZXAgMzogUGFyc2UgIEhUTUwgVGFibGUgaW50byB0aWJibGVzCmBgYHtyfQp0aWJibGVfbGlzdCA8LSBodG1sX3RhYmxlKHRhYmxlc1szXSkKY2FzdF90aWJibGUgPC0gdGliYmxlX2xpc3RbWzFdXQpjYXN0X3RpYmJsZQpgYGAKCgpgYGB7cn0Kd2lraV9odG1sIDwtIHJlYWRfaHRtbCgiaHR0cHM6Ly93d3cuaW1kYi5jb20vdGl0bGUvdHQ3MjM1NDY2L2Z1bGxjcmVkaXRzP3JlZl89dHRfY2xfc20iKQoKcGVyX2h0bWwgPC0gaHRtbF9lbGVtZW50KHdpa2lfaHRtbCwiI2Z1bGxjcmVkaXRzX2NvbnRlbnQgPiB0YWJsZS5jYXN0X2xpc3QiKQoKcGVyIDwtIGh0bWxfdGFibGUocGVyX2h0bWwpCnBlcgpgYGAKCmBgYHtyfQoKYGBgCgo=