library(tidyverse)
# Load the Netflix titles dataset (path is relative to the working directory)
Netflix <- read.csv(file = "Netflix.csv")
# Split the comma-separated cast column into one row per actor and rename the
# column to `actor`.
# read.csv() loads empty cast fields as "" rather than NA, so drop_na() alone
# would keep them and the blank string would later be counted as an actor.
# Blank (or whitespace-only) entries are therefore converted to NA first.
Netflix_Actor <- Netflix %>%
  separate_rows(cast, sep = ", ") %>%
  mutate(cast = na_if(str_trim(cast), "")) %>%
  drop_na(cast) %>%
  rename(actor = cast)
# Display the first rows of the transformed data
head(Netflix_Actor)
## # A tibble: 6 × 12
## show_id type title director actor country date_added release_year rating
## <int> <chr> <chr> <chr> <chr> <chr> <chr> <int> <chr>
## 1 81145628 Movie Norm of … Richard… Alan… United… September… 2019 TV-PG
## 2 81145628 Movie Norm of … Richard… Andr… United… September… 2019 TV-PG
## 3 81145628 Movie Norm of … Richard… Bria… United… September… 2019 TV-PG
## 4 81145628 Movie Norm of … Richard… Cole… United… September… 2019 TV-PG
## 5 81145628 Movie Norm of … Richard… Jenn… United… September… 2019 TV-PG
## 6 81145628 Movie Norm of … Richard… Jona… United… September… 2019 TV-PG
## # ℹ 3 more variables: duration <chr>, listed_in <chr>, description <chr>
# Find the 6 actors with the most appearances in TV shows.
# Missing or blank actor names are excluded so that shows without a recorded
# cast are not ranked as if they were an actor.
top_actors <- Netflix_Actor %>%
  filter(type == "TV Show", !is.na(actor), actor != "") %>%
  count(actor, sort = TRUE) %>%
  slice_head(n = 6)
# Display the results
print(top_actors)
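# NOTE: in the captured output below, row 1 (actor "") is the count of TV shows
# whose cast field is blank in the CSV, not a real actor.
## # A tibble: 6 × 2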
## actor n
## <chr> <int>
## 1 "" 210
## 2 "Takahiro Sakurai" 18
## 3 "Yuki Kaji" 16
## 4 "Daisuke Ono" 14
## 5 "David Attenborough" 14
## 6 "Ashleigh Ball" 12
# Horizontal bar chart of the top actors: bars are ordered by appearance count
# and each bar is labelled with its count just past the bar end.
ggplot(
  data = top_actors,
  mapping = aes(x = reorder(actor, n), y = n, fill = actor)
) +
  geom_col(show.legend = FALSE) +
  geom_text(mapping = aes(label = n), size = 4, hjust = -0.2) +
  scale_fill_brewer(palette = "Set3") +
  # Flip the axes so actor names read horizontally along the vertical axis
  coord_flip() +
  theme_minimal() +
  # Applied after theme_minimal() so these text settings are not overwritten
  theme(
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    plot.title = element_text(face = "bold", size = 16, hjust = 0.5)
  ) +
  labs(
    y = "Number of Appearances",
    x = "Actor",
    title = "Top 6 Actors with Most Appearances in Netflix TV Shows"
  )