Data Visualization

Aleksandra Templin & Ksawery Raupuk

2024-04-16

EXERCISE 1

# Load the CreditCard data set into the workspace.
data(CreditCard)

# Expenditure divided by `months` for every row; rows with months == 0 are
# dropped first so the division never hits zero.
monthly_expenditure <- mutate(
  filter(CreditCard, months != 0),
  AverageMonthlyExpenditure = expenditure / months
)

# Keep the ten rows with the largest AverageMonthlyExpenditure.
top_ten_rows <- head(
  arrange(monthly_expenditure, desc(AverageMonthlyExpenditure)),
  10
)

# Render the result as a striped, hover-highlighted HTML table.
kable_table <- kable_styling(
  kable(
    top_ten_rows,
    format = "html",
    caption = "Top 10 Average Monthly Credit Card Expenditures"
  ),
  bootstrap_options = c("striped", "hover")
)

kable_table
Top 10 Average Monthly Credit Card Expenditures
card reports age income share expenditure owner selfemp dependents months majorcards active AverageMonthlyExpenditure
472 yes 1 43.33333 4.80 0.3706127 1482.3680 yes no 0 3 1 4 494.1227
765 yes 0 44.50000 9.00 0.2599927 1949.8620 no no 2 4 1 7 487.4655
624 yes 0 27.16667 2.60 0.8778461 1902.0000 no no 0 4 1 7 475.5000
1292 yes 0 22.00000 1.75 0.6214691 906.3092 no no 0 2 1 0 453.1546
1181 yes 0 23.16667 2.60 0.8245508 1786.2770 no no 0 4 1 0 446.5693
874 yes 0 25.58333 2.80 0.3252597 758.9391 no no 0 2 1 7 379.4696
434 yes 0 28.91667 2.75 0.3822785 875.8884 yes no 1 3 1 3 291.9628
487 yes 0 22.91667 2.50 0.2395248 498.9267 no no 0 2 1 0 249.4633
763 yes 0 35.25000 6.00 0.4582348 2291.1740 no no 2 10 1 0 229.1174
113 yes 0 28.16667 2.40 0.1137675 227.3683 no no 0 1 1 29 227.3683

EXERCISE 2.1

# Read the Netflix data set from the remote CSV file.
data <- read.csv(
  "https://raw.githubusercontent.com/kflisikowski/ds/master/netflix-dataset.csv?raw=true"
)

# Keep the titles whose Tags value contains "Polish".
has_polish_tag <- grepl("Polish", data$Tags)
polish_films <- data[has_polish_tag, ]

# Histogram of their IMDb scores, in bins of width 0.1.
ggplot(data = polish_films, mapping = aes(x = IMDb.Score)) +
  geom_histogram(binwidth = 0.1, colour = "black", fill = "red") +
  ggtitle("Distribution of Imdb scores for Polish movies and movies-series") +
  xlab("Imdb Score") +
  ylab("Quantity")

EXERCISE 2.2

# Density curve of the IMDb scores of the Polish titles, filled
# semi-transparently so the grid stays visible underneath.
ggplot(data = polish_films, mapping = aes(x = IMDb.Score)) +
  geom_density(alpha = 0.5, colour = "black", fill = "red") +
  ggtitle("Distribution of IMDb scores for Polish movies and movie-series") +
  xlab("IMDb Score") +
  ylab("Density")

EXERCISE 2.3

# Count how many titles list each language. The Languages column is treated
# as a comma-separated list, so it is first split into one row per language.
language_counts <- data %>%
  separate_rows(Languages, sep = ",\\s*") %>%
  # Titles with no language recorded would otherwise be tallied as a
  # "language" of their own and could end up among the top ten.
  filter(!is.na(Languages), trimws(Languages) != "") %>%
  count(Languages, name = "Count") %>%
  arrange(desc(Count))

# Bar chart of the ten most frequent languages. head() returns at most ten
# rows, whereas indexing with [1:10, ] would pad the data with NA rows if
# fewer than ten languages were present.
ggplot(head(language_counts, 10), aes(x = reorder(Languages, Count), y = Count)) +
  geom_col(fill = "black") +
  labs(title = "Top 10 Most Popular Languages on Netflix",
       x = "Languages",
       y = "Number of movies") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

CHALLENGE 1

# Split the comma-separated Actors column into one row per title and actor.
seperated_actor_popularity <- data %>%
  separate_rows(Actors, sep = ",\\s*") %>%
  # Titles with no actors recorded would otherwise be pooled into a single
  # unnamed "actor" whose nominations are summed across all of those titles.
  filter(!is.na(Actors), trimws(Actors) != "")

# Popularity of an actor = total award nominations of the titles they appear
# in (missing nomination counts are ignored in the sum).
actor_popularity <- seperated_actor_popularity %>%
  group_by(Actors) %>%
  summarize(TotalPopularity = sum(Awards.Nominated.For, na.rm = TRUE)) %>%
  arrange(desc(TotalPopularity))

# Horizontal bar chart of the ten highest-ranked actors, largest bar on top.
top_actors <- head(actor_popularity, 10)
ggplot(top_actors, aes(x = reorder(Actors, TotalPopularity), y = TotalPopularity, fill = Actors)) +
  geom_col() +
  coord_flip() +
  labs(title = "Actors starring in the most popular productions",
       x = "Actor",
       y = "movie's popularity") +
  theme_minimal() +
  # The bars are already labelled on the axis, so the fill legend is redundant.
  theme(legend.position = "none")

CHALLENGE 2

# First graph: ratings on a scale from 1 to 100.
# Stack the two score columns into long form (one row per title and rating
# source) and keep only the finite scores.
long_data1to100 <- filter(
  pivot_longer(
    data,
    cols = c("Rotten.Tomatoes.Score", "Metacritic.Score"),
    names_to = "Rating_Source_From_1_to_100",
    values_to = "Score"
  ),
  is.finite(Score)
)

# One box per rating source, filled with a fixed colour per source.
ggplot(
  data = long_data1to100,
  mapping = aes(
    x = Rating_Source_From_1_to_100,
    y = Score,
    fill = Rating_Source_From_1_to_100
  )
) +
  geom_boxplot() +
  scale_fill_manual(
    values = c("Rotten.Tomatoes.Score" = "red", "Metacritic.Score" = "purple")
  ) +
  ggtitle("Rating charts from the various portals scale from 1 to 100") +
  xlab("Portal") +
  ylab("Rate") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Second graph: ratings on a scale from 1 to 10.
# Stack the two score columns into long form (one row per title and rating
# source) and keep only the finite scores.
long_data1to10 <- filter(
  pivot_longer(
    data,
    cols = c("IMDb.Score", "Hidden.Gem.Score"),
    names_to = "Rating_Source_From_1_to_10",
    values_to = "Score"
  ),
  is.finite(Score)
)

# One box per rating source, filled with a fixed colour per source.
ggplot(
  data = long_data1to10,
  mapping = aes(
    x = Rating_Source_From_1_to_10,
    y = Score,
    fill = Rating_Source_From_1_to_10
  )
) +
  geom_boxplot() +
  scale_fill_manual(
    values = c("IMDb.Score" = "yellow", "Hidden.Gem.Score" = "black")
  ) +
  ggtitle("Rating charts from the various portals scale from 1 to 10") +
  xlab("Portal") +
  ylab("Rate") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

CHALLENGE 3 - Which studio produced the most films?

# Count the number of titles attributed to each production house.
# NOTE(review): Production.House is counted as-is; if a value lists several
# studios separated by commas, that combination is counted as one studio.
studio_counts <- data %>%
  # Titles with no production house recorded would otherwise be counted as a
  # "studio" of their own and could top the ranking.
  filter(!is.na(Production.House), trimws(Production.House) != "") %>%
  count(Production.House) %>%
  arrange(desc(n))

top_ten_studios <- head(studio_counts, 10)

# Horizontal bar chart. Ordering the factor by n (ascending) places the
# largest bar at the top once the axes are flipped, matching the actor chart.
ggplot(top_ten_studios, aes(x = reorder(Production.House, n), y = n)) +
  geom_col(fill = "blue") +
  labs(title = "10 Film studios that produced the most films",
       x = "Film studio",
       y = "Number of productions") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_flip()

CHALLENGE 3 - How did this change over the years? * Some records had missing data; we excluded them when making the plot

# Drop rows whose release date is NA.
data <- data[complete.cases(data$Release.Date), ]

# Extract the release year as a four-digit string.
# NOTE(review): assumes Release.Date is written as day/month/year; values that
# do not parse with this format become NA and are removed just below.
data$Year <- format(as.Date(data$Release.Date, format = "%d/%m/%Y"), "%Y")

# Remove the rows for which no year could be derived.
data <- data[!is.na(data$Year), ]

# Number of titles per year whose production house is exactly "Netflix".
netflix_counts <- data %>%
  filter(Production.House == "Netflix") %>%
  count(Year) %>%
  arrange(Year)

# Year is a character column, so ggplot2 treats the x axis as discrete and
# would put every year in its own group, leaving geom_line() nothing to
# connect. group = 1 joins all points into a single line.
ggplot(netflix_counts, aes(x = Year, y = n, group = 1)) +
  geom_line(color = "blue") +
  geom_point(color = "blue") +
  labs(title = "Films produced by Netflix over the years",
       x = "Year",
       y = "Number of films produced") +
  theme_minimal()