Load necessary packages
Read the dataset
# Read the raw cast table. The file's first column has no header, so readr
# names it `...1`.
# NOTE(review): the path is specific to one machine's Downloads folder.
cast <- read_csv(file = "~/Downloads/raw_cast.csv")
## New names:
## Rows: 1298 Columns: 3
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): Name 1, Name 2 dbl (1): ...1
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
Question 7
View first few rows and check structure and types
# Preview the first six rows (head()'s default count, written out explicitly).
head(cast, n = 6L)
## # A tibble: 6 × 3
## ...1 `Name 1` `Name 2`
## <dbl> <chr> <chr>
## 1 1 Angela Bassett "Athena Grant\n 87 episodes, 2018-2022"
## 2 2 Peter Krause "Bobby Nash\n 87 episodes, 2018-2022"
## 3 3 Oliver Stark "Evan 'Buck' Buckley\n 87 episodes, 20…
## 4 4 Aisha Hinds "Henrietta 'Hen' Wilson\n 87 episodes,…
## 5 5 Kenneth Choi "Howie 'Chimney' Han\n 87 episodes, 20…
## 6 6 Corinne Massiah "May Grant\n 82 episodes, 2018-2022"
# Compact listing of every column's type and leading values, plus the
# readr column specification attached to the tibble.
str(object = cast)
## spc_tbl_ [1,298 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ...1 : num [1:1298] 1 2 3 4 5 6 7 8 9 10 ...
## $ Name 1: chr [1:1298] "Angela Bassett" "Peter Krause" "Oliver Stark" "Aisha Hinds" ...
## $ Name 2: chr [1:1298] "Athena Grant\n 87 episodes, 2018-2022" "Bobby Nash\n 87 episodes, 2018-2022" "Evan 'Buck' Buckley\n 87 episodes, 2018-2022" "Henrietta 'Hen' Wilson\n 87 episodes, 2018-2022" ...
## - attr(*, "spec")=
## .. cols(
## .. ...1 = col_double(),
## .. `Name 1` = col_character(),
## .. `Name 2` = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
Summarize dataset
# Per-column summary: quantiles for the numeric column, length/class/mode
# for the character columns.
summary(object = cast)
## ...1 Name 1 Name 2
## Min. : 1.0 Length:1298 Length:1298
## 1st Qu.: 325.2 Class :character Class :character
## Median : 649.5 Mode :character Mode :character
## Mean : 649.5
## 3rd Qu.: 973.8
## Max. :1298.0
Check for missing values and identify duplicate rows
# Count missing values per column.
colSums(is.na(cast))
## ...1 Name 1 Name 2
## 0 118 118
# Whole-row duplicate count. `...1` looks like a unique row index (1..1298),
# so no two complete rows can match and this is 0 regardless of the cast data.
sum(duplicated(cast))
## [1] 0
# Duplicate count on the content columns only, ignoring the row index.
# (Added check: re-run the document to obtain its output.)
sum(duplicated(cast[, c("Name 1", "Name 2")]))
Questions 8 - 10
Split Name 2 into Name2 and Episode
# Split `Name 2` at the first newline: the text before it is the character
# name (Name2); everything after it is the episode summary (Episode).
# The split is computed once and reused for both new columns.
name_parts <- str_split_fixed(cast$`Name 2`, "\n", 2)

cast <- cast %>%
  mutate(
    Name2 = name_parts[, 1],
    Episode = name_parts[, 2] %>%
      # Drop punctuation (e.g. the comma) but keep "-" so a year range such
      # as "2018-2022" is not fused into "20182022".
      str_replace_all("(?!-)[[:punct:]]", "") %>%
      # Remove the indentation left after the newline and collapse any
      # repeated internal whitespace.
      str_squish()
  )

print(cast[, c("Name2", "Episode")])
## # A tibble: 1,298 × 2
## Name2 Episode
## <chr> <chr>
## 1 Athena Grant 87 episodes 2018-2022
## 2 Bobby Nash 87 episodes 2018-2022
## 3 Evan 'Buck' Buckley 87 episodes 2018-2022
## 4 Henrietta 'Hen' Wilson 87 episodes 2018-2022
## 5 Howie 'Chimney' Han 87 episodes 2018-2022
## 6 May Grant 82 episodes 2018-2022
## 7 Maddie Kendall 77 episodes 2018-2022
## 8 Eddie Diaz 77 episodes 2018-2022
## 9 Harry Grant 74 episodes 2018-2022
## 10 Michael Grant 69 episodes 2018-2021
## # ℹ 1,288 more rows