Exercise 1.

Using data on credit card applications’ status, please present a frequency table, in a nicely formatted kable, for the average monthly credit card expenditures of applicants.

library(dplyr)
library(knitr)
library(ggplot2)
library(tidyr)

# Load the credit card application data from a commit-pinned GitHub URL.
df <- read.csv(
  "https://raw.githubusercontent.com/nnaemeka-git/payment/268bd51d03488e5c388231fa038fe0f86635433b/CreditCard.csv"
)

# Preview the first six rows of the data frame.
head(df, n = 6L)
##   X card reports      age income       share expenditure owner selfemp
## 1 1  yes       0 37.66667 4.5200 0.033269910  124.983300   yes      no
## 2 2  yes       0 33.25000 2.4200 0.005216942    9.854167    no      no
## 3 3  yes       0 33.66667 4.5000 0.004155556   15.000000   yes      no
## 4 4  yes       0 30.50000 2.5400 0.065213780  137.869200    no      no
## 5 5  yes       0 32.16667 9.7867 0.067050590  546.503300   yes      no
## 6 6  yes       0 23.25000 2.5000 0.044438400   91.996670    no      no
##   dependents months majorcards active
## 1          3     54          1     12
## 2          3     34          1     13
## 3          4     58          1      5
## 4          0     25          1      7
## 5          2     64          1      5
## 6          0     54          1      1
# Frequency table: for each card-application outcome (`card`), count how many
# applicants are home owners (`owner == "yes"`) and how many are not.
# Comparisons are summed directly (TRUE counts as 1); `na.rm = TRUE` keeps a
# missing or unexpected `owner` value from turning a whole count into NA.
# NOTE(review): the exercise text asks about average monthly expenditure,
# while this table cross-tabulates `card` by `owner` - confirm this is the
# intended summary.
freq_table <- df %>%
  group_by(card) %>%
  summarize(
    yes_count = sum(owner == "yes", na.rm = TRUE),
    no_count = sum(owner == "no", na.rm = TRUE)
  )

# Render the frequency table as a markdown table with readable column headers.
knitr::kable(
  freq_table,
  format = "markdown",
  col.names = c("Card Situation", "OWNER:Y", "OWNER:N")
)
Card Situation OWNER:Y OWNER:N
no 90 206
yes 491 532

Exercise 2.

The data comes from https://flixgem.com/ (dataset version as of March 12, 2021). The data contains information on 9425 movies and series available on Netflix.

Answer with the most appropriate data visualization for the following questions:

  1. What is the distribution of Imdb scores for Polish movies and movie-series?
# Titles whose language field is exactly "Polish" (titles that list Polish
# together with other languages are not matched by this equality test).
polish_titles <- netflixData %>%
  filter(Languages == "Polish", Series.or.Movie %in% c("Movie", "Series"))

# Per-type subsets; kept as separate objects because the density plot that
# follows reads `polish_movies` and `polish_series`.
polish_movies <- polish_titles %>%
  filter(Series.or.Movie == "Movie")
polish_series <- polish_titles %>%
  filter(Series.or.Movie == "Series")

# Overlaid histograms of IMDb scores. Both groups are drawn by one layer so
# they share the same bin width and bin edges (two separate layers would each
# pick their own default bins), and the fill mapping provides a legend.
ggplot(polish_titles, aes(x = IMDb.Score, fill = Series.or.Movie)) +
  geom_histogram(
    binwidth = 0.5,
    boundary = 0,
    position = "identity",
    alpha = 0.7
  ) +
  scale_fill_manual(
    name = "Type",
    values = c("Movie" = "skyblue", "Series" = "darkgreen")
  ) +
  labs(title = "IMDb Scores for Polish Movies and Series",
       x = "IMDb Scores",
       y = "Frequency") +
  theme_minimal()

  2. What is the density function of IMDb scores for Polish movies and movie-series?
# Overlaid density curves of IMDb scores for Polish movies and Polish series.
# The fill aesthetic is set to a constant label per layer so that each curve
# gets its own legend entry.
ggplot() +
  geom_density(
    data = polish_movies,
    aes(x = IMDb.Score, fill = "Polish Movie"),
    alpha = 0.7
  ) +
  geom_density(
    data = polish_series,
    aes(x = IMDb.Score, fill = "Polish Series"),
    alpha = 0.7
  ) +
  labs(title = "Density Function of IMDb Scores for Polish Movies and Series",
       x = "IMDb Scores",
       y = "Density") +
  # Colours are matched to labels by name rather than by position, and the
  # legend gets a real title instead of the default "fill".
  scale_fill_manual(
    name = "Type",
    values = c("Polish Movie" = "darkred", "Polish Series" = "cyan")
  ) +
  theme_minimal()

  3. What are the most popular languages available on Netflix?

We added a photo created on https://flixgem.com/

For extra credits:

Extra challenge 1.: Create a chart showing actors starring in the most popular productions.

# Ten actors with the most titles in the data set.
# Rows with a missing or empty cast are dropped first, then the
# comma-separated cast list is split so that each row holds one actor.
# NOTE(review): the literal "111" is excluded as a workaround for a stray
# "111" bar reported by the author; the root cause was not established.
actor_popularity <- netflixData %>%
  filter(!is.na(Actors), !(Actors %in% c("", "111"))) %>%
  separate_rows(Actors, sep = ", ") %>%
  count(Actors, sort = TRUE, name = "count") %>%
  slice_head(n = 10)

# Bar chart of the ten actors, ordered from most to fewest productions.
ggplot(
  actor_popularity,
  aes(x = reorder(Actors, -count), y = count, fill = Actors)
) +
  geom_col() +
  labs(
    title = "Top 10 Actors Starring in popular productions on Netflix",
    x = "Actor",
    y = "Number of Productions"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Extra challenge 2.: For movies and series, create rating charts from the various portals (Hidden Gem, IMDb, Rotten Tomatoes, Metacritic). Hint: it’s a good idea to reshape the data to long format.

# Keep only rows typed as a movie or a series.
netflix_filtered <- netflixData %>%
  filter(Series.or.Movie %in% c("Movie", "Series"))

# Long format: one row per (title, rating portal) pair, missing ratings
# dropped.
ratings_long <- netflix_filtered %>%
  select(
    Title, Series.or.Movie,
    Hidden.Gem.Score, IMDb.Score, Rotten.Tomatoes.Score, Metacritic.Score
  ) %>%
  pivot_longer(
    cols = c(
      Hidden.Gem.Score, IMDb.Score, Rotten.Tomatoes.Score, Metacritic.Score
    ),
    names_to = "Rating_Source",
    values_to = "Rating"
  ) %>%
  filter(!is.na(Rating))

# Histogram of one portal's ratings, with movies and series in separate
# panels.
#   ratings       long-format ratings (columns Rating_Source, Rating,
#                 Series.or.Movie)
#   source_column value of Rating_Source to plot
#   label         human-readable portal name used in the title and x axis
#   fill_colour   bar colour
#   binwidth      histogram bin width, in the portal's own score units
# Returns the ggplot object.
plot_rating_distribution <- function(ratings, source_column, label,
                                     fill_colour, binwidth) {
  portal_ratings <- ratings[ratings$Rating_Source == source_column, ]

  ggplot(portal_ratings, aes(x = Rating, fill = Rating_Source)) +
    geom_histogram(binwidth = binwidth, alpha = 0.7) +
    scale_fill_manual(values = setNames(fill_colour, source_column)) +
    facet_wrap(~Series.or.Movie, scales = "free", nrow = 2) +
    labs(title = paste(label, "Score Distribution"),
         x = paste(label, "Score"),
         y = "Count") +
    theme_minimal()
}

# One chart per portal. The bin width follows the portal's scale: 0.5 for
# Hidden Gem and IMDb, 5 for Rotten Tomatoes and Metacritic (assumed to be
# 0-100 scores, for which 0.5-wide bins would be far too narrow).
plot_rating_distribution(
  ratings_long, "Hidden.Gem.Score", "Hidden Gem", "blue", binwidth = 0.5
)
plot_rating_distribution(
  ratings_long, "IMDb.Score", "IMDb", "green", binwidth = 0.5
)
plot_rating_distribution(
  ratings_long, "Rotten.Tomatoes.Score", "Rotten Tomatoes", "red", binwidth = 5
)
plot_rating_distribution(
  ratings_long, "Metacritic.Score", "Metacritic", "purple", binwidth = 5
)

Extra challenge 3.: Which film studios produce the most and how has this changed over the years?

Comment from Stanisław: unfortunately this is not done too well, because the individual Production.House values are overshadowed by the overall count and I don’t know how to fix it. I tried finding the most dominant house first and then plotting just that one, but it didn’t quite work, so I left this version because it at least shows that Netflix was dominant.

# Number of titles per production house and release year.
# The year is derived on the full `netflix_filtered` table first; rows without
# a usable year or without a production house (missing or empty) are then
# dropped so that unlabeled titles cannot form a series of their own.
studio_year_counts <- netflix_filtered %>%
  mutate(Release.Year = lubridate::year(as.Date(Release.Date))) %>%
  filter(
    !is.na(Release.Year),
    !is.na(Production.House),
    Production.House != ""
  ) %>%
  count(Production.House, Release.Year, name = "count")

# The eight production houses with the most titles overall. Plotting every
# house at once makes the individual lines unreadable.
top_studios <- studio_year_counts %>%
  group_by(Production.House) %>%
  summarise(total = sum(count), .groups = "drop") %>%
  slice_max(total, n = 8, with_ties = FALSE)

# Yearly counts restricted to those houses, largest counts first.
production_studios <- studio_year_counts %>%
  semi_join(top_studios, by = "Production.House") %>%
  arrange(desc(count))

# One line per production house; a legend replaces per-point text labels,
# which overlapped each other.
ggplot(
  production_studios,
  aes(x = Release.Year, y = count, color = Production.House)
) +
  geom_line() +
  geom_point(size = 1) +
  labs(title = "Production Trend by Film Studios Over Years",
       x = "Release Year",
       y = "Number of Productions",
       color = "Production house") +
  theme_minimal() +
  theme(legend.position = "bottom")

---
title: "Data Visualization"
author: "Tomasz Kruczalak, Krzysica Stanisław, Hatice Tatli"
output:
  html_document: 
    theme: cerulean
    highlight: textmate
    fontsize: 8pt
    toc: yes
    code_download: yes
    toc_float:
      collapsed: no
    df_print: default
    toc_depth: 5
editor_options: 
  markdown: 
    wrap: 72
---



```{r, include=FALSE}
## Global options
# Sets the markup option read by the qwraps2 package; no qwraps2 function is
# called in the visible part of this document.
options(qwraps2_markup = "markdown")
# Packages used by the chunks below: dplyr (data manipulation), knitr (kable
# tables and chunk options), ggplot2 (plots), tidyr (reshaping).
library(dplyr)
library(knitr)
library(ggplot2)
library(tidyr)
# Suppress warnings and messages in the output of all subsequent chunks.
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
```

## Exercise 1. 

Using data on credit card applications' status, please present a frequency table, in a nicely formatted kable, for the average monthly credit card expenditures of applicants.

```{r ex1}
library(dplyr)
library(knitr)
library(ggplot2)
library(tidyr)

# Load the credit card application data from a commit-pinned GitHub URL.
df <- read.csv(file = "https://raw.githubusercontent.com/nnaemeka-git/payment/268bd51d03488e5c388231fa038fe0f86635433b/CreditCard.csv")

# Displaying the first rows:
head(df)

# Frequency table: for each card-application outcome (`card`), count how many
# applicants are home owners (`owner == "yes"`) and how many are not.
# Comparisons are summed directly (TRUE counts as 1); `na.rm = TRUE` keeps a
# missing or unexpected `owner` value from turning a whole count into NA.
# NOTE(review): the exercise text asks about average monthly expenditure,
# while this table cross-tabulates `card` by `owner` - confirm this is the
# intended summary.
freq_table <- df %>%
  group_by(card) %>%
  summarize(
    yes_count = sum(owner == "yes", na.rm = TRUE),
    no_count = sum(owner == "no", na.rm = TRUE)
  )

# Render the frequency table as a markdown table with readable column headers.
knitr::kable(
  freq_table,
  format = "markdown",
  col.names = c("Card Situation", "OWNER:Y", "OWNER:N")
)

```


## Exercise 2.

The data comes from [https://flixgem.com/](https://flixgem.com/) (dataset version as of March 12, 2021). The data contains information on 9425 movies and series available on Netflix.

```{r ex2, message=FALSE, warning=FALSE, include=FALSE}
# Download the Netflix data set to a local file ("dane.csv" in the working
# directory) and read it from there as UTF-8 encoded CSV.
download.file(
  "https://raw.githubusercontent.com/kflisikowski/ds/master/netflix-dataset.csv?raw=true",
  destfile = "dane.csv",
  mode = "wb"
)
netflixData <- read.csv(
  file = "dane.csv",
  encoding = "UTF-8",
  header = TRUE,
  sep = ","
)
# attach(netflixData) was removed: every later chunk reaches the columns
# through the data frame (dplyr verbs / ggplot `data`), and attaching copies
# the columns onto the search path where they can silently mask or be masked
# by other objects.

```

Answer with the most appropriate data visualization for the following questions:

1. What is the distribution of Imdb scores for Polish movies and movie-series?

```{r ex2_1}
# Titles whose language field is exactly "Polish" (titles that list Polish
# together with other languages are not matched by this equality test).
polish_titles <- netflixData %>%
  filter(Languages == "Polish", Series.or.Movie %in% c("Movie", "Series"))

# Per-type subsets; kept as separate objects because the density chunk that
# follows reads `polish_movies` and `polish_series`.
polish_movies <- polish_titles %>%
  filter(Series.or.Movie == "Movie")
polish_series <- polish_titles %>%
  filter(Series.or.Movie == "Series")

# Overlaid histograms of IMDb scores. Both groups are drawn by one layer so
# they share the same bin width and bin edges (two separate layers would each
# pick their own default bins), and the fill mapping provides a legend.
ggplot(polish_titles, aes(x = IMDb.Score, fill = Series.or.Movie)) +
  geom_histogram(
    binwidth = 0.5,
    boundary = 0,
    position = "identity",
    alpha = 0.7
  ) +
  scale_fill_manual(
    name = "Type",
    values = c("Movie" = "skyblue", "Series" = "darkgreen")
  ) +
  labs(title = "IMDb Scores for Polish Movies and Series",
       x = "IMDb Scores",
       y = "Frequency") +
  theme_minimal()
```

2. What is the density function of Imdb scores for Polish movies and movie-series?

```{r ex2_2}
# Overlaid density curves of IMDb scores for Polish movies and Polish series.
# The fill aesthetic is set to a constant label per layer so that each curve
# gets its own legend entry.
ggplot() +
  geom_density(
    data = polish_movies,
    aes(x = IMDb.Score, fill = "Polish Movie"),
    alpha = 0.7
  ) +
  geom_density(
    data = polish_series,
    aes(x = IMDb.Score, fill = "Polish Series"),
    alpha = 0.7
  ) +
  labs(title = "Density Function of IMDb Scores for Polish Movies and Series",
       x = "IMDb Scores",
       y = "Density") +
  # Colours are matched to labels by name rather than by position, and the
  # legend gets a real title instead of the default "fill".
  scale_fill_manual(
    name = "Type",
    values = c("Polish Movie" = "darkred", "Polish Series" = "cyan")
  ) +
  theme_minimal()
```

3. What are the most popular languages available on Netflix?

We added a photo created on https://flixgem.com/

For extra credits:

*Extra challenge 1.*: Create a chart showing actors starring in the most popular productions.

```{r challenge1}
# Ten actors with the most titles in the data set.
# Rows with a missing or empty cast are dropped first, then the
# comma-separated cast list is split so that each row holds one actor.
# NOTE(review): the literal "111" is excluded as a workaround for a stray
# "111" bar reported by the author; the root cause was not established.
actor_popularity <- netflixData %>%
  filter(!is.na(Actors), !(Actors %in% c("", "111"))) %>%
  separate_rows(Actors, sep = ", ") %>%
  count(Actors, sort = TRUE, name = "count") %>%
  slice_head(n = 10)

# Bar chart of the ten actors, ordered from most to fewest productions.
ggplot(
  actor_popularity,
  aes(x = reorder(Actors, -count), y = count, fill = Actors)
) +
  geom_col() +
  labs(
    title = "Top 10 Actors Starring in popular productions on Netflix",
    x = "Actor",
    y = "Number of Productions"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```


*Extra challenge 2.*: For movies and series, create rating charts from the various portals (Hidden Gem, IMDb, Rotten Tomatoes, Metacritic). Hint: it's a good idea to reshape the data to *long* format.

```{r challenge2}
# Keep only rows typed as a movie or a series.
netflix_filtered <- netflixData %>%
  filter(Series.or.Movie %in% c("Movie", "Series"))

# Long format: one row per (title, rating portal) pair, missing ratings
# dropped.
ratings_long <- netflix_filtered %>%
  select(
    Title, Series.or.Movie,
    Hidden.Gem.Score, IMDb.Score, Rotten.Tomatoes.Score, Metacritic.Score
  ) %>%
  pivot_longer(
    cols = c(
      Hidden.Gem.Score, IMDb.Score, Rotten.Tomatoes.Score, Metacritic.Score
    ),
    names_to = "Rating_Source",
    values_to = "Rating"
  ) %>%
  filter(!is.na(Rating))

# Histogram of one portal's ratings, with movies and series in separate
# panels.
#   ratings       long-format ratings (columns Rating_Source, Rating,
#                 Series.or.Movie)
#   source_column value of Rating_Source to plot
#   label         human-readable portal name used in the title and x axis
#   fill_colour   bar colour
#   binwidth      histogram bin width, in the portal's own score units
# Returns the ggplot object.
plot_rating_distribution <- function(ratings, source_column, label,
                                     fill_colour, binwidth) {
  portal_ratings <- ratings[ratings$Rating_Source == source_column, ]

  ggplot(portal_ratings, aes(x = Rating, fill = Rating_Source)) +
    geom_histogram(binwidth = binwidth, alpha = 0.7) +
    scale_fill_manual(values = setNames(fill_colour, source_column)) +
    facet_wrap(~Series.or.Movie, scales = "free", nrow = 2) +
    labs(title = paste(label, "Score Distribution"),
         x = paste(label, "Score"),
         y = "Count") +
    theme_minimal()
}

# One chart per portal. The bin width follows the portal's scale: 0.5 for
# Hidden Gem and IMDb, 5 for Rotten Tomatoes and Metacritic (assumed to be
# 0-100 scores, for which 0.5-wide bins would be far too narrow).
plot_rating_distribution(
  ratings_long, "Hidden.Gem.Score", "Hidden Gem", "blue", binwidth = 0.5
)
plot_rating_distribution(
  ratings_long, "IMDb.Score", "IMDb", "green", binwidth = 0.5
)
plot_rating_distribution(
  ratings_long, "Rotten.Tomatoes.Score", "Rotten Tomatoes", "red", binwidth = 5
)
plot_rating_distribution(
  ratings_long, "Metacritic.Score", "Metacritic", "purple", binwidth = 5
)
```

*Extra challenge 3.*: Which film studios produce the most and how has this changed over the years?

Comment from Stanisław: unfortunately this is not done too well, because the individual Production.House values are overshadowed by the overall count and I don't know how to fix it. I tried finding the most dominant house first and then plotting just that one, but it didn't quite work, so I left this version because it at least shows that Netflix was dominant.

```{r challenge3}

# Number of titles per production house and release year.
# The year is derived on the full `netflix_filtered` table first; rows without
# a usable year or without a production house (missing or empty) are then
# dropped so that unlabeled titles cannot form a series of their own.
studio_year_counts <- netflix_filtered %>%
  mutate(Release.Year = lubridate::year(as.Date(Release.Date))) %>%
  filter(
    !is.na(Release.Year),
    !is.na(Production.House),
    Production.House != ""
  ) %>%
  count(Production.House, Release.Year, name = "count")

# The eight production houses with the most titles overall. Plotting every
# house at once makes the individual lines unreadable.
top_studios <- studio_year_counts %>%
  group_by(Production.House) %>%
  summarise(total = sum(count), .groups = "drop") %>%
  slice_max(total, n = 8, with_ties = FALSE)

# Yearly counts restricted to those houses, largest counts first.
production_studios <- studio_year_counts %>%
  semi_join(top_studios, by = "Production.House") %>%
  arrange(desc(count))

# One line per production house; a legend replaces per-point text labels,
# which overlapped each other.
ggplot(
  production_studios,
  aes(x = Release.Year, y = count, color = Production.House)
) +
  geom_line() +
  geom_point(size = 1) +
  labs(title = "Production Trend by Film Studios Over Years",
       x = "Release Year",
       y = "Number of Productions",
       color = "Production house") +
  theme_minimal() +
  theme(legend.position = "bottom")

```