Load necessary packages

Read the dataset

# Import the raw cast table. The file's first column has no header, so
# readr renames it to `...1` (see the "New names" message below).
# NOTE(review): the path points into a user-specific Downloads folder;
# confirm where the file should live before sharing this report.
cast <- read_csv(file = "~/Downloads/raw_cast.csv")
## New names:
## Rows: 1298 Columns: 3
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): Name 1, Name 2 dbl (1): ...1
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

Question 7

View the first few rows and check the structure and column types

# Preview the first six rows of the imported table.
cast %>% head()
## # A tibble: 6 × 3
##    ...1 `Name 1`        `Name 2`                                                
##   <dbl> <chr>           <chr>                                                   
## 1     1 Angela Bassett  "Athena Grant\n                  87 episodes, 2018-2022"
## 2     2 Peter Krause    "Bobby Nash\n                  87 episodes, 2018-2022"  
## 3     3 Oliver Stark    "Evan 'Buck' Buckley\n                  87 episodes, 20…
## 4     4 Aisha Hinds     "Henrietta 'Hen' Wilson\n                  87 episodes,…
## 5     5 Kenneth Choi    "Howie 'Chimney' Han\n                  87 episodes, 20…
## 6     6 Corinne Massiah "May Grant\n                  82 episodes, 2018-2022"
# Show each column's type together with its first few values.
cast %>% str()
## spc_tbl_ [1,298 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ...1  : num [1:1298] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Name 1: chr [1:1298] "Angela Bassett" "Peter Krause" "Oliver Stark" "Aisha Hinds" ...
##  $ Name 2: chr [1:1298] "Athena Grant\n                  87 episodes, 2018-2022" "Bobby Nash\n                  87 episodes, 2018-2022" "Evan 'Buck' Buckley\n                  87 episodes, 2018-2022" "Henrietta 'Hen' Wilson\n                  87 episodes, 2018-2022" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ...1 = col_double(),
##   ..   `Name 1` = col_character(),
##   ..   `Name 2` = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Summarize dataset

# Per-column summary: quartiles for the numeric index column, and
# length/class/mode for the two character columns.
cast %>% summary()
##       ...1           Name 1             Name 2         
##  Min.   :   1.0   Length:1298        Length:1298       
##  1st Qu.: 325.2   Class :character   Class :character  
##  Median : 649.5   Mode  :character   Mode  :character  
##  Mean   : 649.5                                        
##  3rd Qu.: 973.8                                        
##  Max.   :1298.0

Check for missing values and identify duplicate rows

# Count missing values per column: flag every NA cell, then total the
# flags column by column.
cast %>%
  is.na() %>%
  colSums()
##   ...1 Name 1 Name 2 
##      0    118    118
# Count rows that exactly repeat an earlier row.
cast %>%
  duplicated() %>%
  sum()
## [1] 0

Questions 8 - 10

Split `Name 2` into `Name2` (the character name) and `Episode` (the episode count and years)

# Split `Name 2` once at the first newline: the text before it is the
# character name, the text after it is the episode count and year range.
name2_parts <- str_split_fixed(cast$`Name 2`, "\n", 2)

cast <- cast %>%
  mutate(
    Name2 = name2_parts[, 1],
    # Strip punctuation, then trim the indentation that followed the
    # newline in the raw text so the values are not padded with spaces.
    # NOTE(review): removing punctuation also drops the hyphen in the
    # year range ("2018-2022" becomes "20182022") -- confirm this is
    # what the question asks for.
    Episode = str_trim(str_replace_all(name2_parts[, 2], "[[:punct:]]", ""))
  )
print(cast[, c("Name2", "Episode")])
## # A tibble: 1,298 × 2
##    Name2                  Episode             
##    <chr>                  <chr>               
##  1 Athena Grant           87 episodes 20182022
##  2 Bobby Nash             87 episodes 20182022
##  3 Evan 'Buck' Buckley    87 episodes 20182022
##  4 Henrietta 'Hen' Wilson 87 episodes 20182022
##  5 Howie 'Chimney' Han    87 episodes 20182022
##  6 May Grant              82 episodes 20182022
##  7 Maddie Kendall         77 episodes 20182022
##  8 Eddie Diaz             77 episodes 20182022
##  9 Harry Grant            74 episodes 20182022
## 10 Michael Grant          69 episodes 20182021
## # ℹ 1,288 more rows