Data Visualization

Aleksandra Templin & Ksawery Raupuk

2024-04-16

EXERCISE 1

# Load the CreditCard example dataset into the session.
data(CreditCard)

# Rank cardholders by their average spend per month:
#   1. rows with months == 0 are dropped so the division is defined,
#   2. expenditure is divided by months,
#   3. rows are sorted from the highest average down and the first ten kept.
top_ten_rows <- CreditCard %>%
  filter(months != 0) %>%
  mutate(AverageMonthlyExpenditure = expenditure / months) %>%
  arrange(desc(AverageMonthlyExpenditure)) %>%
  head(n = 10)

# Render the ten rows as a striped, hover-highlighted HTML table.
kable_table <- kable_styling(
  kable(
    top_ten_rows,
    format = "html",
    caption = "Top 10 Average Monthly Credit Card Expenditures"
  ),
  bootstrap_options = c("striped", "hover")
)

kable_table

Top 10 Average Monthly Credit Card Expenditures
	card	reports	age	income	share	expenditure	owner	selfemp	dependents	months	majorcards	active	AverageMonthlyExpenditure
472	yes	1	43.33333	4.80	0.3706127	1482.3680	yes	no	0	3	1	4	494.1227
765	yes	0	44.50000	9.00	0.2599927	1949.8620	no	no	2	4	1	7	487.4655
624	yes	0	27.16667	2.60	0.8778461	1902.0000	no	no	0	4	1	7	475.5000
1292	yes	0	22.00000	1.75	0.6214691	906.3092	no	no	0	2	1	0	453.1546
1181	yes	0	23.16667	2.60	0.8245508	1786.2770	no	no	0	4	1	0	446.5693
874	yes	0	25.58333	2.80	0.3252597	758.9391	no	no	0	2	1	7	379.4696
434	yes	0	28.91667	2.75	0.3822785	875.8884	yes	no	1	3	1	3	291.9628
487	yes	0	22.91667	2.50	0.2395248	498.9267	no	no	0	2	1	0	249.4633
763	yes	0	35.25000	6.00	0.4582348	2291.1740	no	no	2	10	1	0	229.1174
113	yes	0	28.16667	2.40	0.1137675	227.3683	no	no	0	1	1	29	227.3683

EXERCISE 2.1

# Download the Netflix catalogue; `data` is reused by the later exercises.
data <- read.csv(
  "https://raw.githubusercontent.com/kflisikowski/ds/master/netflix-dataset.csv?raw=true"
)

# Keep the titles whose Tags mention "Polish". fixed = TRUE matches the word
# literally instead of interpreting it as a regular expression.
polish_films <- data[grep("Polish", data$Tags, fixed = TRUE), ]

# Histogram of the IMDb scores of those titles, in bins 0.1 points wide.
# Labels use the same "IMDb" / "movie-series" wording as the density plot
# in Exercise 2.2.
ggplot(polish_films, aes(x = IMDb.Score)) +
  geom_histogram(binwidth = 0.1, fill = "red", color = "black") +
  labs(
    title = "Distribution of IMDb scores for Polish movies and movie-series",
    x = "IMDb Score",
    y = "Quantity"
  )

EXERCISE 2.2

# Kernel-density view of the IMDb scores of the Polish titles; the
# half-transparent red fill keeps the grid lines visible underneath.
ggplot(data = polish_films, mapping = aes(x = IMDb.Score)) +
  geom_density(alpha = 0.5, color = "black", fill = "red") +
  ggtitle("Distribution of IMDb scores for Polish movies and movie-series") +
  xlab("IMDb Score") +
  ylab("Density")

EXERCISE 2.3

# Count how many titles are available in each language.
# Languages holds a comma-separated list per title, so it is first split
# into one row per language. Missing or blank entries are removed so that
# titles without language information are not ranked as a "language".
# sort = TRUE orders the result from the most to the least frequent.
language_counts <- data %>%
  separate_rows(Languages, sep = ",\\s*") %>%
  mutate(Languages = trimws(Languages)) %>%
  filter(!is.na(Languages), Languages != "") %>%
  count(Languages, name = "Count", sort = TRUE)

# head() returns at most ten rows; indexing with [1:10, ] would pad the
# result with NA rows if fewer than ten languages were present.
top_languages <- head(language_counts, 10)

# Bar chart of the ten most frequent languages, bars ordered by count.
ggplot(top_languages, aes(x = reorder(Languages, Count), y = Count)) +
  geom_col(fill = "black") +
  labs(title = "Top 10 Most Popular Languages on Netflix",
       x = "Languages",
       y = "Number of movies") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

CHALLENGE 1

# One row per (title, actor): Actors holds a comma-separated cast list.
# Titles with a missing or blank cast are dropped; otherwise they would be
# pooled into a single empty-named "actor" and could enter the ranking.
seperated_actor_popularity <- data %>%
  separate_rows(Actors, sep = ",\\s*") %>%
  filter(!is.na(Actors), Actors != "")

# Popularity of an actor is measured here as the total number of award
# nominations (Awards.Nominated.For) across all titles they appear in;
# titles with a missing nomination count contribute nothing to the sum.
actor_popularity <- seperated_actor_popularity %>%
  group_by(Actors) %>%
  summarize(TotalPopularity = sum(Awards.Nominated.For, na.rm = TRUE)) %>%
  arrange(desc(TotalPopularity))

# Horizontal bar chart of the ten highest totals, largest bar on top.
# The legend is hidden because the axis already names every actor.
top_actors <- head(actor_popularity, 10)
ggplot(top_actors, aes(x = reorder(Actors, TotalPopularity), y = TotalPopularity, fill = Actors)) +
  geom_col() +
  coord_flip() +
  labs(title = "Actors starring in the most popular productions",
       x = "Actor",
       y = "movie's popularity") +
  theme_minimal() +
  theme(legend.position = "none")

CHALLENGE 2

# First chart: the portals that rate on a 1 to 100 scale.
# The two score columns are stacked into (portal, Score) pairs and only
# finite scores are kept, so missing ratings do not reach the plot.
long_data1to100 <- pivot_longer(
  data,
  cols = c("Rotten.Tomatoes.Score", "Metacritic.Score"),
  names_to = "Rating_Source_From_1_to_100",
  values_to = "Score"
) %>%
  filter(is.finite(Score))

# One box per portal, coloured with a fixed portal-to-colour mapping.
ggplot(long_data1to100) +
  aes(
    x = Rating_Source_From_1_to_100,
    y = Score,
    fill = Rating_Source_From_1_to_100
  ) +
  geom_boxplot() +
  scale_fill_manual(
    values = c("Rotten.Tomatoes.Score" = "red", "Metacritic.Score" = "purple")
  ) +
  ggtitle("Rating charts from the various portals scale from 1 to 100") +
  xlab("Portal") +
  ylab("Rate") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Second chart: the portals that rate on a 1 to 10 scale.
# Same reshaping as above: stack the score columns into (portal, Score)
# pairs and keep only the finite scores.
long_data1to10 <- pivot_longer(
  data,
  cols = c("IMDb.Score", "Hidden.Gem.Score"),
  names_to = "Rating_Source_From_1_to_10",
  values_to = "Score"
) %>%
  filter(is.finite(Score))

# One box per portal, coloured with a fixed portal-to-colour mapping.
ggplot(long_data1to10) +
  aes(
    x = Rating_Source_From_1_to_10,
    y = Score,
    fill = Rating_Source_From_1_to_10
  ) +
  geom_boxplot() +
  scale_fill_manual(
    values = c("IMDb.Score" = "yellow", "Hidden.Gem.Score" = "black")
  ) +
  ggtitle("Rating charts from the various portals scale from 1 to 10") +
  xlab("Portal") +
  ylab("Rate") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

CHALLENGE 3 - Which studio produced the most films?

# Count how many titles each production house has in the catalogue.
# Titles with a missing or blank Production.House are excluded first;
# otherwise "no studio recorded" would itself be counted as a studio.
# NOTE(review): Production.House is counted as a whole string here; if a
# title lists several studios in one field they are not split - confirm
# whether that is intended.
studio_counts <- data %>%
  filter(!is.na(Production.House), trimws(Production.House) != "") %>%
  count(Production.House) %>%
  arrange(desc(n))

top_ten_studios <- head(studio_counts, 10)

# Horizontal bar chart of the ten most frequent production houses.
ggplot(top_ten_studios, aes(x = reorder(Production.House, -n), y = n)) +
  geom_col(fill = "blue") +
  labs(title = "10 Film studios that produced the most films",
       x = "Film studio",
       y = "Number of productions") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_flip()

CHALLENGE 3 - How did it change over the years? * Rows with missing release dates were left out when making the plot

# Derive the release year of every title as a four-digit string.
# Release.Date is parsed as day/month/year; as.Date() returns NA for dates
# that are missing or do not match that pattern, so Year is NA for them.
# NOTE(review): confirm that Release.Date really uses the %d/%m/%Y layout;
# if it does not, every Year is NA and the plot below will be empty.
data$Year <- format(as.Date(data$Release.Date, format = "%d/%m/%Y"), "%Y")

# Drop the titles without a usable year (this also covers missing dates).
data <- data[!is.na(data$Year), ]

# Number of titles per year whose production house is exactly "Netflix".
netflix_counts <- data %>%
  filter(Production.House == "Netflix") %>%
  count(Year) %>%
  arrange(Year)

# Year is a character column, so ggplot2 treats it as discrete and would
# put every year in its own group, leaving geom_line() with nothing to
# connect. group = 1 places all points in one group so the line is drawn.
ggplot(netflix_counts, aes(x = Year, y = n, group = 1)) +
  geom_line(color = "blue") +
  geom_point(color = "blue") +
  labs(title = "Films produced by Netflix over the years",
       x = "Year",
       y = "Number of films produced") +
  theme_minimal()