Assignment3.knit

library(robotstxt)

URL to check

# IMDb full-credits page whose robots.txt permission is checked below.
url <- "https://www.imdb.com/title/tt7235466/fullcredits"

Check whether the path may be scraped

# Ask robotstxt whether bots may fetch this path; the result is stored as
# a logical flag used by the report below.
is_allowed <- robotstxt::paths_allowed(paths = url)

##  www.imdb.com

# Print a single verdict line depending on the robots.txt check result.
print(
  if (is_allowed) {
    "Scraping is allowed for this page."
  } else {
    "Scraping is NOT allowed for this page."
  }
)

## [1] "Scraping is allowed for this page."

library(rvest)
# Point at the full-credits page (here with a ref_ query string) and
# download and parse its HTML.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
webpage <- rvest::read_html(x = url)

Tables

# Collect every <table> element found in the parsed page.
tables <- html_nodes(x = webpage, css = "table")

3rd table

# Extract the third <table> node with `[[` so html_table() receives a single
# node and returns one data frame. Using `tables[3]` keeps the node-set
# container, html_table() then returns a list, and dim() on it is NULL.
series_cast_table <- html_table(tables[[3]], fill = TRUE)

Rows and columns

# Show the dimensions (rows, columns) of the series cast table.
# dim() yields NULL when the object is a plain list rather than a data frame.
dim(series_cast_table)

## NULL

library(rvest)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

URL

# IMDb full-credits page to scrape (includes a ref_ query string).
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"

HTML content

# Download the page at `url` and parse it into an HTML document.
webpage <- url %>% read_html()

Tables from page

# Gather all <table> elements from the parsed document.
tables <- webpage %>% html_nodes("table")

3rd table

# Take the third table node (via `[[`, so a single node) and convert it
# to a data frame.
cast_table <- tables[[3]] %>% html_table(fill = TRUE)

Cleaned

# Keep only the 2nd and 4th columns (named X2 and X4), then drop every row
# where either of them is NA or an empty string.
clean_cast <- cast_table %>%
  select(2, 4) %>%
  filter(
    !is.na(X2), X2 != "",
    !is.na(X4), X4 != ""
  )

Tail

# Drop rows whose X2 or X4 is an empty string. The preceding cleaning step
# already applies the same conditions, so this pass removes nothing new.
clean_cast <- filter(clean_cast, X2 != "", X4 != "")

Display final dimensions

# Record the size of the cleaned table and print it as a one-line summary.
num_rows <- dim(clean_cast)[1L]
num_columns <- dim(clean_cast)[2L]
cat(
  "Final cleaned dataset contains:", num_rows,
  "observations and", num_columns, "columns.\n"
)

## Final cleaned dataset contains: 1575 observations and 2 columns.

Display cleaned data

# Preview the first six rows of the cleaned cast table.
head(clean_cast, n = 6L)

## # A tibble: 6 × 2
##   X2                   X4                                                       
##   <chr>                <chr>                                                    
## 1 Angela Bassett       "Athena Grant\n         / ...  \n                  115 e…
## 2 Peter Krause         "Bobby Nash\n                  115 episodes, 2018-2025"  
## 3 Oliver Stark         "Evan 'Buck' Buckley\n                  115 episodes, 20…
## 4 Aisha Hinds          "Henrietta 'Hen' Wilson\n                  115 episodes,…
## 5 Kenneth Choi         "Howie 'Chimney' Han\n                  115 episodes, 20…
## 6 Jennifer Love Hewitt "Maddie Kendall\n         / ...  \n                  105…