Exercise 2.
The data comes from https://flixgem.com/ (dataset version as
of March 12, 2021). The data contains information on 9425 movies and
series available on Netflix.
Answer with the most appropriate data visualization for the following
questions:
- What is the distribution of IMDb scores for Polish movies and
movie-series?
#Create Polsih Movies
polish_movies <- netflixData %>%
filter(Languages == "Polish", Series.or.Movie == "Movie")
#Create Polsih Series
polish_series <- netflixData %>%
filter(Languages == "Polish", Series.or.Movie == "Series")
# Plot histograms for IMDb scores for Polish movies and series
ggplot() +
geom_histogram(data = polish_movies, aes(x = IMDb.Score), fill = "skyblue", alpha = 0.7) +
geom_histogram(data = polish_series, aes(x = IMDb.Score), fill = "darkgreen", alpha = 0.7) +
labs(title = "IMDb Scores for Polish Movies and Series",
x = "IMDb Scores",
y = "Frequency") +
theme_minimal()

- What is the density function of IMDb scores for Polish movies and
movie-series?
# Plot density for IMDb scores for Polish movies and series
ggplot() +
geom_density(data = polish_movies, aes(x = IMDb.Score, fill = "Polish Movie"), alpha = 0.7) +
geom_density(data = polish_series, aes(x = IMDb.Score, fill = "Polish Series"), alpha = 0.7) +
labs(title = "Density Function of IMDb Scores for Polish Movies and Series",
x = "IMDb Scores",
y = "Density") +
scale_fill_manual(values = c("darkred", "cyan")) +
theme_minimal()

- What are the most popular languages available on Netflix?
We added a photo created on https://flixgem.com/
For extra credits:
Extra challenge 1.: Create a chart showing actors starring
in the most popular productions.
# Filter out non-actor entries and aggregate data by actors
actor_popularity <- netflixData %>%
filter(!is.na(Actors) & !Actors %in% c("", "111")) %>% # there was an error that i don't know how not otherwise resolve that a bar "111" would appear out of nowhere...
separate_rows(Actors, sep = ", ") %>%
group_by(Actors) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>%
slice_head(n = 10)
# Create a bar chart
ggplot(actor_popularity, aes(x = reorder(Actors, -count), y = count, fill = Actors)) +
geom_bar(stat = "identity") +
labs(title = "Top 10 Actors Starring in popular productions on Netflix",
x = "Actor",
y = "Number of Productions") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

Extra challenge 2.: For movies and series, create rating
charts from the various portals (Hidden Gem, IMDb, Rotten Tomatoes,
Metacritic). Hint: it’s a good idea to reshape the data to long
format.
# Filtering out movies and series
netflix_filtered <- netflixData %>%
filter(Series.or.Movie %in% c("Movie", "Series"))
# Reshaping the data to long format for ratings
ratings_long <- netflix_filtered %>%
select(Title, Series.or.Movie, Hidden.Gem.Score, IMDb.Score, `Rotten.Tomatoes.Score`, `Metacritic.Score`) %>%
pivot_longer(cols = c(Hidden.Gem.Score, IMDb.Score, `Rotten.Tomatoes.Score`, `Metacritic.Score`),
names_to = "Rating_Source",
values_to = "Rating") %>%
filter(!is.na(Rating))
# Hidden Gem Score chart
ggplot(ratings_long %>% filter(Rating_Source == "Hidden.Gem.Score"),
aes(x = Rating, fill = Rating_Source)) +
geom_histogram(binwidth = 0.5, alpha = 0.7) +
scale_fill_manual(values = c("Hidden.Gem.Score" = "blue")) +
facet_wrap(~Series.or.Movie, scales = "free", nrow = 2) +
labs(title = "Hidden Gem Score Distribution",
x = "Hidden Gem Score",
y = "Count") +
theme_minimal()

# IMDb Score chart with unique color
ggplot(ratings_long %>% filter(Rating_Source == "IMDb.Score"),
aes(x = Rating, fill = Rating_Source)) +
geom_histogram(binwidth = 0.5, alpha = 0.7) +
scale_fill_manual(values = c("IMDb.Score" = "green")) +
facet_wrap(~Series.or.Movie, scales = "free", nrow = 2) +
labs(title = "IMDb Score Distribution",
x = "IMDb Score",
y = "Count") +
theme_minimal()

# Rotten Tomatoes Score chart
ggplot(ratings_long %>% filter(Rating_Source == "Rotten.Tomatoes.Score"),
aes(x = Rating, fill = Rating_Source)) +
geom_histogram(binwidth = 0.5, alpha = 0.7) +
scale_fill_manual(values = c("Rotten.Tomatoes.Score" = "red")) +
facet_wrap(~Series.or.Movie, scales = "free", nrow = 2) +
labs(title = "Rotten Tomatoes Score Distribution",
x = "Rotten Tomatoes Score",
y = "Count") +
theme_minimal()

# Metacritic Score chart
ggplot(ratings_long %>% filter(Rating_Source == "Metacritic.Score"),
aes(x = Rating, fill = Rating_Source)) +
geom_histogram(binwidth = 0.5, alpha = 0.7) +
scale_fill_manual(values = c("Metacritic.Score" = "purple")) +
facet_wrap(~Series.or.Movie, scales = "free", nrow = 2) +
labs(title = "Metacritic Score Distribution",
x = "Metacritic Score",
y = "Count") +
theme_minimal()

Extra challenge 3.: Which film studios produce the most and
how has this changed over the years?
Comment from Stanisław: unfortunately this is not done too well, because the
individual Production.House lines are overshadowed by the overall count and I
don't know how to fix it. I tried finding the most dominant house first and
then plotting just that one, but it didn’t quite work, so I left this version
because it at least shows that Netflix was dominant.
# Aggregating data by Production.House
production_studios <- netflix_filtered %>%
group_by(Production.House, Release.Year = lubridate::year(as.Date(Release.Date))) %>%
summarise(count = n()) %>%
filter(!is.na(Release.Year)) %>%
arrange(desc(count))
# Creating a line chart to show the trend with labels for each production house
ggplot(production_studios, aes(x = Release.Year, y = count, group = Production.House, color = Production.House, label = Production.House)) +
geom_line() +
geom_text(aes(label = Production.House), hjust = -0.2, vjust = 0.5, size = 3, color = "black", check_overlap = TRUE) +
labs(title = "Production Trend by Film Studios Over Years",
x = "Release Year",
y = "Number of Productions") +
theme_minimal() +
theme(legend.position = "none")

---
title: "Data Visualization"
author: "Tomasz Kruczalak, Krzysica Stanisław, Hatice Tatli"
output:
  html_document: 
    theme: cerulean
    highlight: textmate
    fontsize: 8pt
    toc: yes
    code_download: yes
    toc_float:
      collapsed: no
    df_print: default
    toc_depth: 5
editor_options: 
  markdown: 
    wrap: 72
---



```{r, include=FALSE}
## Global options
# Sets the `qwraps2_markup` option. qwraps2 itself is not attached in this
# chunk - NOTE(review): presumably a template leftover; confirm before removing.
options(qwraps2_markup = "markdown")
# Packages used by the chunks below: dplyr (filter/summarise pipelines),
# knitr (kable tables), ggplot2 (plots), tidyr (separate_rows, pivot_longer).
library(dplyr)
library(knitr)
library(ggplot2)
library(tidyr)
# Default for every chunk: do not show warnings or messages in the output.
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
```

## Exercise 1. 

Using the data on credit card application status, present a frequency table, nicely formatted with `kable`, of the applicants' average monthly credit card expenditures.

```{r ex1}
library(dplyr)
library(knitr)
library(ggplot2)
library(tidyr)

# Load the credit card application data straight from GitHub.
df <- read.csv(file = "https://raw.githubusercontent.com/nnaemeka-git/payment/268bd51d03488e5c388231fa038fe0f86635433b/CreditCard.csv")

# Displaying the first rows:
head(df)

# Cross-tabulation of `card` by `owner`: for every value of `card`, count the
# rows with owner == "yes" and owner == "no". Rows with any other `owner`
# value yield NA from case_when() and therefore an NA count.
freq_table <- df %>%
  group_by(card) %>%
  summarize(
    yes_count = sum(case_when(owner == "yes" ~ 1, owner == "no" ~ 0)),
    no_count = sum(case_when(owner == "yes" ~ 0, owner == "no" ~ 1))
  )

# Displaying the Frequency Table with Kable Method
knitr::kable(freq_table, format = "markdown", col.names = c("Card Situation", "OWNER:Y", "OWNER:N"))

# Frequency table of average monthly credit card expenditure, which is what
# the exercise actually asks for.
# NOTE(review): assumes the CSV has a numeric `expenditure` column holding the
# average monthly expenditure - confirm against head(df) above.
stopifnot("expenditure" %in% names(df))

# Left-closed bins [a, b); the last bin is open-ended so large values are kept.
expenditure_breaks <- c(0, 50, 100, 200, 500, 1000, Inf)
expenditure_labels <- c(
  "[0, 50)", "[50, 100)", "[100, 200)",
  "[200, 500)", "[500, 1000)", "[1000, Inf)"
)

expenditure_freq <- df %>%
  mutate(
    expenditure_range = cut(
      expenditure,
      breaks = expenditure_breaks,
      labels = expenditure_labels,
      right = FALSE
    )
  ) %>%
  count(expenditure_range, name = "applicants") %>%
  mutate(share = round(100 * applicants / sum(applicants), 1))

knitr::kable(
  expenditure_freq,
  format = "markdown",
  col.names = c("Average monthly expenditure", "Applicants", "Share (%)")
)

```


## Exercise 2.

The data comes from [https://flixgem.com/](https://flixgem.com/) (dataset version as of March 12, 2021). The data contains information on 9425 movies and series available on Netflix.

```{r ex2, message=FALSE, warning=FALSE, include=FALSE}
# Download the Netflix dataset to a local CSV and load it.
# mode = "wb" makes download.file() write the file in binary mode.
download.file(
  "https://raw.githubusercontent.com/kflisikowski/ds/master/netflix-dataset.csv?raw=true",
  destfile = "dane.csv",
  mode = "wb"
)
netflixData <- read.csv(
  file = "dane.csv",
  encoding = "UTF-8",
  header = TRUE,
  sep = ","
)
# attach(netflixData) was dropped on purpose: the chunks in this document
# always pass the data frame explicitly (netflixData %>% ..., ggplot(data = ...)),
# so putting its columns on the search path only risks masking other objects.

```

Answer with the most appropriate data visualization for the following questions:

1. What is the distribution of IMDb scores for Polish movies and movie-series?

```{r ex2_1}
# Polish movies: rows whose Languages value is exactly "Polish".
# NOTE(review): an exact match skips titles that list Polish together with
# other languages (if the column can hold several) - confirm this is intended.
polish_movies <- netflixData %>%
  filter(Languages == "Polish", Series.or.Movie == "Movie")

# Polish series, selected the same way.
polish_series <- netflixData %>%
  filter(Languages == "Polish", Series.or.Movie == "Series")

# Overlaid histograms of IMDb scores for Polish movies and series.
# The fill is mapped inside aes() so ggplot2 draws a legend that says which
# colour is which, and both layers share an explicit bin width instead of the
# default of 30 bins.
ggplot() +
  geom_histogram(
    data = polish_movies,
    aes(x = IMDb.Score, fill = "Movie"),
    binwidth = 0.5, alpha = 0.7
  ) +
  geom_histogram(
    data = polish_series,
    aes(x = IMDb.Score, fill = "Series"),
    binwidth = 0.5, alpha = 0.7
  ) +
  scale_fill_manual(
    name = "Type",
    values = c("Movie" = "skyblue", "Series" = "darkgreen")
  ) +
  labs(title = "IMDb Scores for Polish Movies and Series",
       x = "IMDb Scores",
       y = "Frequency") +
  theme_minimal()
```

2. What is the density function of IMDb scores for Polish movies and movie-series?

```{r ex2_2}
# Kernel density of IMDb scores for Polish movies and series, overlaid.
# polish_movies / polish_series come from the previous chunk.
ggplot() +
  geom_density(
    data = polish_movies,
    aes(x = IMDb.Score, fill = "Polish Movie"),
    alpha = 0.7
  ) +
  geom_density(
    data = polish_series,
    aes(x = IMDb.Score, fill = "Polish Series"),
    alpha = 0.7
  ) +
  # Colours are bound by name, so they stay correct if a label changes, and
  # the legend gets a real title instead of the default "fill".
  scale_fill_manual(
    name = "Type",
    values = c("Polish Movie" = "darkred", "Polish Series" = "cyan")
  ) +
  labs(title = "Density Function of IMDb Scores for Polish Movies and Series",
       x = "IMDb Scores",
       y = "Density") +
  theme_minimal()
```

3. What are the most popular languages available on Netflix?

We added a photo created on https://flixgem.com/

For extra credits:

*Extra challenge 1.*: Create a chart showing actors starring in the most popular productions.

```{r challenge1}
# Values that are not real actor names.
# NOTE(review): "111" is a spurious entry the original author saw in the
# chart; its origin in the data is unconfirmed.
excluded_actor_values <- c("", "111")

# Count productions per actor and keep the ten most frequent.
# Actors holds a ", "-separated cast list, so it is split into one row per
# actor first. The exclusion runs AFTER the split (and after trimming stray
# whitespace) so it matches individual names rather than whole cast strings.
actor_popularity <- netflixData %>%
  separate_rows(Actors, sep = ", ") %>%
  mutate(Actors = trimws(Actors)) %>%
  filter(!is.na(Actors), !Actors %in% excluded_actor_values) %>%
  count(Actors, name = "count", sort = TRUE) %>%
  slice_head(n = 10)

# Bar chart, bars ordered from most to fewest productions.
# The fill legend is hidden because it would only repeat the x-axis labels.
ggplot(actor_popularity, aes(x = reorder(Actors, -count), y = count, fill = Actors)) +
  geom_col(show.legend = FALSE) +
  labs(title = "Top 10 Actors Starring in popular productions on Netflix",
       x = "Actor",
       y = "Number of Productions") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```


*Extra challenge 2.*: For movies and series, create rating charts from the various portals (Hidden Gem, IMDb, Rotten Tomatoes, Metacritic). Hint: it's a good idea to reshape the data to *long* format.

```{r challenge2}
# Keep only rows flagged as a movie or a series.
netflix_filtered <- filter(
  netflixData,
  Series.or.Movie %in% c("Movie", "Series")
)

# The four rating columns that get reshaped and charted.
score_columns <- c(
  "Hidden.Gem.Score", "IMDb.Score",
  "Rotten.Tomatoes.Score", "Metacritic.Score"
)

# Long format: one row per (title, rating source); missing ratings dropped.
ratings_long <- netflix_filtered %>%
  select(Title, Series.or.Movie, all_of(score_columns)) %>%
  pivot_longer(
    cols = all_of(score_columns),
    names_to = "Rating_Source",
    values_to = "Rating",
    values_drop_na = TRUE
  )

# Histogram of one rating source, one facet per Series.or.Movie value.
#   source_column: value of Rating_Source to plot (a name from score_columns)
#   axis_label:    human-readable name used for the x axis and the title
#   bar_colour:    fill colour of the bars
# NOTE(review): binwidth is 0.5 for every source; if Rotten Tomatoes and
# Metacritic are on a 0-100 scale this may be too fine - confirm.
plot_rating_distribution <- function(source_column, axis_label, bar_colour) {
  source_ratings <- filter(ratings_long, Rating_Source == source_column)

  ggplot(source_ratings, aes(x = Rating, fill = Rating_Source)) +
    geom_histogram(binwidth = 0.5, alpha = 0.7) +
    scale_fill_manual(values = setNames(bar_colour, source_column)) +
    facet_wrap(~Series.or.Movie, scales = "free", nrow = 2) +
    labs(
      title = paste(axis_label, "Distribution"),
      x = axis_label,
      y = "Count"
    ) +
    theme_minimal()
}

# One chart per portal, each with its own colour.
plot_rating_distribution("Hidden.Gem.Score", "Hidden Gem Score", "blue")

plot_rating_distribution("IMDb.Score", "IMDb Score", "green")

plot_rating_distribution("Rotten.Tomatoes.Score", "Rotten Tomatoes Score", "red")

plot_rating_distribution("Metacritic.Score", "Metacritic Score", "purple")
```

*Extra challenge 3.*: Which film studios produce the most and how has this changed over the years?

Comment from Stanisław: unfortunately this is not done too well, because the individual Production.House lines are overshadowed by the overall count and I don't know how to fix it. I tried finding the most dominant house first and then plotting just that one, but it didn't quite work, so I left this version because it at least shows that Netflix was dominant.

```{r challenge3}

# How many studios to show; plotting every studio makes the chart unreadable.
top_n_studios <- 10

# Productions per studio and release year.
# The year is derived before any row is dropped so date parsing sees the same
# input as before. Rows without a usable year or with a blank production
# house are then removed - a blank "studio" would otherwise be counted as one
# big studio and dominate the chart.
# NOTE(review): Production.House may list several studios in one string;
# such combinations are counted as their own "studio" here - confirm.
production_studios <- netflix_filtered %>%
  mutate(Release.Year = lubridate::year(as.Date(Release.Date))) %>%
  filter(
    !is.na(Release.Year),
    !is.na(Production.House),
    trimws(Production.House) != ""
  ) %>%
  count(Production.House, Release.Year, name = "count") %>%
  arrange(desc(count))

# Studios with the most productions overall.
top_studios <- production_studios %>%
  count(Production.House, wt = count, name = "total", sort = TRUE) %>%
  slice_head(n = top_n_studios)

# Yearly counts restricted to those studios.
production_trend <- production_studios %>%
  semi_join(top_studios, by = "Production.House")

# One line per studio, identified by a colour legend instead of overlapping
# text labels. Points keep studios with a single release year visible.
ggplot(production_trend,
       aes(x = Release.Year, y = count, color = Production.House)) +
  geom_line() +
  geom_point(size = 1) +
  labs(title = "Production Trend by Film Studios Over Years",
       x = "Release Year",
       y = "Number of Productions",
       color = "Production house") +
  theme_minimal() +
  theme(legend.position = "bottom")

```