Exercise 2.
The data comes from https://flixgem.com/ (dataset version as
of March 12, 2021). The data contains information on 9425 movies and
series available on Netflix.
Answer with the most appropriate data visualization for the following
questions:
- What is the distribution of IMDb scores for Polish movies and
series?
# Polish movies: rows whose Languages value is exactly "Polish".
# NOTE(review): `==` is an exact match, so any title whose Languages cell
# lists more than one language is excluded -- confirm this is intended.
polish_movies <- netflixData %>%
  filter(Languages == "Polish", Series.or.Movie == "Movie")

# Polish series: same language filter, restricted to series.
polish_series <- netflixData %>%
  filter(Languages == "Polish", Series.or.Movie == "Series")

# Overlaid histograms of IMDb scores for Polish movies and series.
# `fill` is mapped inside aes() so that each layer gets a legend entry, and an
# explicit binwidth replaces ggplot2's default of 30 bins.
ggplot() +
  geom_histogram(
    data = polish_movies,
    aes(x = IMDb.Score, fill = "Polish Movie"),
    binwidth = 0.5,
    alpha = 0.7
  ) +
  geom_histogram(
    data = polish_series,
    aes(x = IMDb.Score, fill = "Polish Series"),
    binwidth = 0.5,
    alpha = 0.7
  ) +
  scale_fill_manual(
    name = NULL,
    values = c("Polish Movie" = "skyblue", "Polish Series" = "darkgreen")
  ) +
  labs(
    title = "IMDb Scores for Polish Movies and Series",
    x = "IMDb Scores",
    y = "Frequency"
  ) +
  theme_minimal()

- What is the density function of IMDb scores for Polish movies and
series?
# Overlaid density curves of IMDb scores for Polish movies and series.
# The x aesthetic is shared by both layers; each layer supplies its own data
# and a constant fill label that becomes its legend entry.
ggplot(mapping = aes(x = IMDb.Score)) +
  geom_density(
    data = polish_movies,
    aes(fill = "Polish Movie"),
    alpha = 0.7
  ) +
  geom_density(
    data = polish_series,
    aes(fill = "Polish Series"),
    alpha = 0.7
  ) +
  # Colours are named so they stay attached to the right group regardless of
  # level order; name = NULL removes the uninformative "fill" legend title.
  scale_fill_manual(
    name = NULL,
    values = c("Polish Movie" = "darkred", "Polish Series" = "cyan")
  ) +
  labs(
    title = "Density Function of IMDb Scores for Polish Movies and Series",
    x = "IMDb Scores",
    y = "Density"
  ) +
  theme_minimal()

- What are the most popular languages available on Netflix?
We added a photo created on https://flixgem.com/
For extra credit:
Extra challenge 1.: Create a chart showing actors starring
in the most popular productions.
# Count productions per actor and keep the ten most frequent names.
actor_popularity <- netflixData %>%
  # One row per (production, actor): Actors is split on ", ".
  separate_rows(Actors, sep = ", ") %>%
  # Filter AFTER splitting so missing, blank and the stray "111" values are
  # removed whether they fill a whole cell or sit inside a longer cast list.
  # NOTE(review): the origin of "111" is unknown -- worth checking the raw data.
  filter(!is.na(Actors), !Actors %in% c("", "111")) %>%
  # count() = group_by() + summarise(n()) + arrange(desc()), ungrouped result.
  count(Actors, name = "count", sort = TRUE) %>%
  slice_head(n = 10)

# Bar chart of the top ten actors, ordered from most to fewest productions.
ggplot(
  actor_popularity,
  aes(x = reorder(Actors, -count), y = count, fill = Actors)
) +
  # The x axis already names every actor, so the fill legend is suppressed.
  geom_col(show.legend = FALSE) +
  labs(
    title = "Top 10 Actors Starring in popular productions on Netflix",
    x = "Actor",
    y = "Number of Productions"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Extra challenge 2.: For movies and series, create rating
charts from the various portals (Hidden Gem, IMDb, Rotten Tomatoes,
Metacritic). Hint: it’s a good idea to reshape the data to long
format.
# Keep only rows classified as a movie or a series.
netflix_filtered <- filter(
  netflixData,
  Series.or.Movie %in% c("Movie", "Series")
)

# Long format: one row per (title, rating portal) with a non-missing rating.
ratings_long <- netflix_filtered %>%
  select(
    Title,
    Series.or.Movie,
    Hidden.Gem.Score,
    IMDb.Score,
    Rotten.Tomatoes.Score,
    Metacritic.Score
  ) %>%
  pivot_longer(
    # Everything except the two identifier columns is a score column.
    cols = -c(Title, Series.or.Movie),
    names_to = "Rating_Source",
    values_to = "Rating",
    # Drop rows whose rating is NA while reshaping.
    values_drop_na = TRUE
  )
# Histogram of one rating portal's scores, one facet per production type.
#
# data          Long-format ratings with columns Rating_Source, Rating and
#               Series.or.Movie (as built in `ratings_long`).
# rating_source Value of Rating_Source to plot, e.g. "IMDb.Score".
# label         Human-readable portal name used for the title and x axis.
# fill_colour   Bar colour; set as a constant so no one-entry legend is drawn.
# binwidth      Width of the histogram bins, in score units.
#
# Returns the ggplot object (auto-printed when called at top level).
plot_rating_distribution <- function(data, rating_source, label,
                                     fill_colour, binwidth = 0.5) {
  data %>%
    filter(Rating_Source == rating_source) %>%
    ggplot(aes(x = Rating)) +
    geom_histogram(binwidth = binwidth, fill = fill_colour, alpha = 0.7) +
    facet_wrap(~Series.or.Movie, scales = "free", nrow = 2) +
    labs(
      title = paste(label, "Distribution"),
      x = label,
      y = "Count"
    ) +
    theme_minimal()
}

# Hidden Gem Score chart
plot_rating_distribution(
  ratings_long, "Hidden.Gem.Score", "Hidden Gem Score", "blue"
)

# IMDb Score chart
plot_rating_distribution(
  ratings_long, "IMDb.Score", "IMDb Score", "green"
)

# Rotten Tomatoes Score chart
# NOTE(review): Rotten Tomatoes and Metacritic scores are presumably on a
# 0-100 scale, where a 0.5 binwidth produces hundreds of sparse bins; 5 is
# used instead. Confirm against the data and revert to 0.5 if they are 0-10.
plot_rating_distribution(
  ratings_long, "Rotten.Tomatoes.Score", "Rotten Tomatoes Score", "red",
  binwidth = 5
)

# Metacritic Score chart (same binwidth assumption as Rotten Tomatoes)
plot_rating_distribution(
  ratings_long, "Metacritic.Score", "Metacritic Score", "purple",
  binwidth = 5
)

Extra challenge 3.: Which film studios produce the most and
how has this changed over the years?
Comment from Stanisław: unfortunately this is not done too well, because
each individual Production.House is overshadowed by the overall count and I
don't know how to fix it. I tried finding the most dominant house first and
then plotting just that one, but it didn't quite work, so I left this chart
because it at least shows that Netflix was dominant.
# Number of productions per studio and release year, largest counts first.
# count() returns an ungrouped table, so later verbs are not silently applied
# per studio.
production_studios <- netflix_filtered %>%
  mutate(Release.Year = lubridate::year(as.Date(Release.Date))) %>%
  filter(!is.na(Release.Year)) %>%
  count(Production.House, Release.Year, name = "count", sort = TRUE)

# The ten named studios with the most productions overall. Missing and blank
# studio names are excluded here so they cannot crowd out real studios.
# NOTE(review): if a Production.House cell can list several studios, consider
# splitting it (as done for Actors) before counting -- confirm against the data.
top_studios <- production_studios %>%
  filter(!is.na(Production.House), Production.House != "") %>%
  count(Production.House, wt = count, name = "total", sort = TRUE) %>%
  slice_head(n = 10)

# Yearly counts restricted to those top studios.
top_studio_trends <- production_studios %>%
  semi_join(top_studios, by = "Production.House")

# One line per top studio; plotting only ten studios keeps the chart readable,
# so a colour legend replaces the per-point text labels.
ggplot(
  top_studio_trends,
  aes(x = Release.Year, y = count, color = Production.House)
) +
  geom_line() +
  # Points keep studios with a single release year visible (a line needs two).
  geom_point(size = 1) +
  labs(
    title = "Production Trend by Film Studios Over Years",
    x = "Release Year",
    y = "Number of Productions",
    color = "Studio"
  ) +
  theme_minimal()

