# install.packages("pacman")
# Load the two packages this notebook relies on through pacman:
# robotstxt for the pre-scraping permission check, rvest for the scraping.
pacman::p_load(char = c("robotstxt", "rvest"))
Pre-scraping check
# library(robotstxt)
# Check whether the full-credits page may be scraped before
# requesting it; the result is auto-printed.
paths_allowed(
  paths = "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
)
www.imdb.com
[1] TRUE
Q6
# Q6: parse every table on the full-credits page and report the size of
# the one used as the series cast table.
library(rvest)

url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
page <- read_html(url)

# Parse all <table> elements of the document into a list of tables.
tables <- html_table(page)
length(tables)

# The third parsed table is taken as the series cast table.
series_cast_table <- tables[[3]]

# dim() gives c(number of rows, number of columns).
rows <- dim(series_cast_table)[1]
columns <- dim(series_cast_table)[2]

cat("The table has", rows, "rows and", columns, "columns.\n")
print(head(series_cast_table))
Q7
# Q7: reduce the series cast table to two columns and drop incomplete rows,
# then report the size of the cleaned table.
library(rvest)
library(dplyr)

url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
page <- read_html(url)

# Parse all <table> elements of the document into a list of tables.
tables <- page %>% html_table()

# The cast table is addressed by position, so fail with a clear message
# (instead of "subscript out of bounds") if the page has fewer tables.
if (length(tables) < 3) {
  stop(
    "Expected at least 3 tables on the page but found ", length(tables),
    "; the page layout may have changed.",
    call. = FALSE
  )
}
series_cast_table <- tables[[3]]

# Keep columns 2 and 4 by position.
cleaned_table <- series_cast_table %>% select(2, 4)

# Keep only rows where both retained columns are present and non-empty.
# `[[` extracts plain vectors, so the mask is an ordinary logical vector
# regardless of whether the table is a tibble or a base data frame.
# is.na() is tested explicitly because `NA != ""` evaluates to NA.
# A separate "all cells empty" pass is unnecessary: any such row already
# fails this condition.
cleaned_table <- cleaned_table[
  !is.na(cleaned_table[[1]]) & cleaned_table[[1]] != "" &
    !is.na(cleaned_table[[2]]) & cleaned_table[[2]] != "",
]

final_rows <- nrow(cleaned_table)
final_columns <- ncol(cleaned_table)
cat("The cleaned cast table has", final_rows, "observations and", final_columns, "columns. \n")
print(head(cleaned_table))
Q9
# Q9: count the staff listed under the "Series Visual Effects" heading.
library(rvest)
library(dplyr)

url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
page <- read_html(url)

# Select the first <table> that follows, as a sibling, an <h4> whose text
# contains "Series Visual Effects". html_elements() is used for consistency
# with the rest of the notebook (html_nodes() is its superseded name).
visual_effects_nodes <- page %>%
  html_elements(xpath = "//h4[contains(text(), 'Series Visual Effects')]/following-sibling::table[1]")

# Fail with a clear message (instead of "subscript out of bounds") when the
# heading or its table is not found.
if (length(visual_effects_nodes) == 0) {
  stop(
    "No table found after a 'Series Visual Effects' heading; ",
    "the page layout may have changed.",
    call. = FALSE
  )
}

visual_effects_table <- html_table(visual_effects_nodes)[[1]]

# Drop rows whose first column is missing or empty so that only rows with
# an entry in that column are counted.
visual_effects_table <- visual_effects_table[!is.na(visual_effects_table[[1]]) & visual_effects_table[[1]] != "", ]

num_staff <- nrow(visual_effects_table)
cat("The number of staff who worked on Series Visual Effects is", num_staff, ".\n")
print(head(visual_effects_table))
Step 1: Read the HTML Webpage
# Download and parse the full-credits page; the parsed document is then
# auto-printed for inspection.
bas_html <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm" %>%
  read_html()
bas_html
{html_document}
<html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n<script type="text/jav ...
[2] <body id="styleguide-v2" class="fixed">\n <img height="1" width="1" style="display:none; ...
Step 2: Select HTML elements
# Collect every <table> element of the parsed page as a node set and
# auto-print it.
tables <- bas_html %>%
  html_elements(css = "table")
tables
Step 3: Parse HTML Table into tibbles
# Parse only the third <table> node. Single-bracket indexing keeps a node
# set, so html_table() returns a one-element list from which the table is
# then extracted.
tibble_list <- tables[3] %>%
  html_table()
cast_tibble <- tibble_list[[1]]
cast_tibble
# Alternative to positional indexing: pick the cast table directly with a
# CSS selector (a table with class "cast_list" that is a direct child of
# the element with id "fullcredits_content"), then parse it.
wiki_html <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm" %>%
  read_html()

per_html <- wiki_html %>%
  html_element(css = "#fullcredits_content > table.cast_list")

per <- per_html %>%
  html_table()
per
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCmBgYHtyfQojIGluc3RhbGwucGFja2FnZXMoInBhY21hbiIpCgpwYWNtYW46OnBfbG9hZChyb2JvdHN0eHQsIHJ2ZXN0KQpgYGAKCiMgUHJlLXNjcmFwaW5nIGNoZWNrCmBgYHtyfQojIGxpYnJhcnkocm9ib3RzdHh0KQpwYXRoc19hbGxvd2VkKCJodHRwczovL3d3dy5pbWRiLmNvbS90aXRsZS90dDcyMzU0NjYvZnVsbGNyZWRpdHM/cmVmXz10dF9jbF9zbSIpCmBgYAoKUTYKYGBge3J9CmxpYnJhcnkocnZlc3QpCgp1cmwgPC0gImh0dHBzOi8vd3d3LmltZGIuY29tL3RpdGxlL3R0NzIzNTQ2Ni9mdWxsY3JlZGl0cz9yZWZfPXR0X2NsX3NtIgoKcGFnZSA8LSByZWFkX2h0bWwodXJsKQoKdGFibGVzIDwtIHBhZ2UgJT4lIGh0bWxfdGFibGUoKQoKbGVuZ3RoKHRhYmxlcykKCnNlcmllc19jYXN0X3RhYmxlIDwtIHRhYmxlc1tbM11dCgpyb3dzIDwtIG5yb3coc2VyaWVzX2Nhc3RfdGFibGUpCmNvbHVtbnMgPC0gbmNvbChzZXJpZXNfY2FzdF90YWJsZSkKCmNhdCgnVGhlIHRhYmxlIGhhcycsIHJvd3MsICdyb3dzIGFuZCcsIGNvbHVtbnMsICdjb2x1bW5zLlxuJykKCnByaW50KGhlYWQoc2VyaWVzX2Nhc3RfdGFibGUpKQpgYGAKCgpRNwpgYGB7cn0KbGlicmFyeShydmVzdCkKbGlicmFyeShkcGx5cikKCnVybCA8LSAiaHR0cHM6Ly93d3cuaW1kYi5jb20vdGl0bGUvdHQ3MjM1NDY2L2Z1bGxjcmVkaXRzP3JlZl89dHRfY2xfc20iCgpwYWdlIDwtIHJlYWRfaHRtbCh1cmwpCgp0YWJsZXMgPC0gcGFnZSAlPiUgaHRtbF90YWJsZSgpCgpzZXJpZXNfY2FzdF90YWJsZSA8LSB0YWJsZXNbWzNdXQoKY2xlYW5lZF90YWJsZSA8LSBzZXJpZXNfY2FzdF90YWJsZSAlPiUgc2VsZWN0KDIsIDQpCgpjbGVhbmVkX3RhYmxlIDwtIHN1YnNldChjbGVhbmVkX3RhYmxlLCBjbGVhbmVkX3RhYmxlWywgMV0gIT0gIiIgJiBjbGVhbmVkX3RhYmxlWywgMl0gIT0gIiIpCgoKY2xlYW5lZF90YWJsZSA8LSBjbGVhbmVkX3RhYmxlWyFhcHBseShjbGVhbmVkX3RhYmxlID09ICIiLCAxLCBhbGwpLCBdCgoKZmluYWxfcm93cyA8LSBucm93KGNsZWFuZWRfdGFibGUpCmZpbmFsX2NvbHVtbnMgPC0gbmNvbChjbGVhbmVkX3RhYmxlKQoKY2F0KCJUaGUgY2xlYW5lZCBjYXN0IHRhYmxlIGhhcyIsIGZpbmFsX3Jvd3MsICJvYmVyc2VydmF0aW9ucyBhbmQiLCBmaW5hbF9jb2x1bW5zLCAiY29sdW1ucy4gXG4iKQoKcHJpbnQoaGVhZChjbGVhbmVkX3RhYmxlKSkKYGBgCgpROQpgYGB7cn0KbGlicmFyeShydmVzdCkKbGlicmFyeShkcGx5cikKCnVybCA8LSAiaHR0cHM6Ly93d3cuaW1kYi5jb20vdGl0bGUvdHQ3MjM1NDY2L2Z1bGxjcmVkaXRzP3JlZl89dHRfY2xfc20iCgpwYWdlIDwtIHJlYWRfaHRtbCh1cmwpCgoKdmlzdWFsX2VmZmVjdHNfdGFibGUgPC0gcGFnZSAlPiUgCiAgaHRtbF9ub2Rlcyh4cGF0aCA9ICIvL2g0W2NvbnRhaW5zKHRleHQoKSwgJ1NlcmllcyBWaXN1YWwgRWZmZWN0
cycpXS9mb2xsb3dpbmctc2libGluZzo6dGFibGVbMV0iKSAlPiUgCiAgaHRtbF90YWJsZSgpCgp2aXN1YWxfZWZmZWN0c190YWJsZSA8LSB2aXN1YWxfZWZmZWN0c190YWJsZVtbMV1dCgp2aXN1YWxfZWZmZWN0c190YWJsZSA8LSB2aXN1YWxfZWZmZWN0c190YWJsZVshaXMubmEodmlzdWFsX2VmZmVjdHNfdGFibGVbWzFdXSkgJiB2aXN1YWxfZWZmZWN0c190YWJsZVtbMV1dICE9ICIiLCBdCgpudW1fc3RhZmYgPC0gbnJvdyh2aXN1YWxfZWZmZWN0c190YWJsZSkKCmNhdCgiVGhlIG51bWJlciBvZiBzdGFmZiB3aG8gd29ya2VkIG9uIFNlcmllcyBWaXN1YWwgRWZmZWN0cyBpcyIsIG51bV9zdGFmZiwgIi5cbiIpCgpwcmludChoZWFkKHZpc3VhbF9lZmZlY3RzX3RhYmxlKSkKYGBgCgojIFN0ZXAgMTogUmVhZCB0aGUgSFRNTCBXZWJwYWdlCmBgYHtyfQpiYXNfaHRtbCA8LSByZWFkX2h0bWwoImh0dHBzOi8vd3d3LmltZGIuY29tL3RpdGxlL3R0NzIzNTQ2Ni9mdWxsY3JlZGl0cz9yZWZfPXR0X2NsX3NtIikKYmFzX2h0bWwKYGBgCgojIFN0ZXAgMjogU2VsZWN0IEhUTUwgZWxlbWVudHMKYGBge3J9CnRhYmxlcyA8LSBodG1sX2VsZW1lbnRzKGJhc19odG1sLCAidGFibGUiKQp0YWJsZXMKYGBgCgojIFN0ZXAgMzogUGFyc2UgIEhUTUwgVGFibGUgaW50byB0aWJibGVzCmBgYHtyfQp0aWJibGVfbGlzdCA8LSBodG1sX3RhYmxlKHRhYmxlc1szXSkKY2FzdF90aWJibGUgPC0gdGliYmxlX2xpc3RbWzFdXQpjYXN0X3RpYmJsZQpgYGAKCgpgYGB7cn0Kd2lraV9odG1sIDwtIHJlYWRfaHRtbCgiaHR0cHM6Ly93d3cuaW1kYi5jb20vdGl0bGUvdHQ3MjM1NDY2L2Z1bGxjcmVkaXRzP3JlZl89dHRfY2xfc20iKQoKcGVyX2h0bWwgPC0gaHRtbF9lbGVtZW50KHdpa2lfaHRtbCwiI2Z1bGxjcmVkaXRzX2NvbnRlbnQgPiB0YWJsZS5jYXN0X2xpc3QiKQoKcGVyIDwtIGh0bWxfdGFibGUocGVyX2h0bWwpCnBlcgpgYGAKCmBgYHtyfQoKYGBgCgo=