DataVisualisationReport
Exercise 1.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the credit card data set from the working directory.
data = pd.read_csv('credit_card.csv')

# Calculate average monthly credit card expenditure
data['avg_expenditure'] = data['expenditure'] / data['months']
data.head()

# Plot the per-row average against the row index.
# NOTE(review): Series.plot() draws a line plot by default, while the title
# says "Bar Chart" - confirm which of the two was intended.
data['avg_expenditure'].plot(figsize=(10, 6), color='skyblue', alpha=0.7)
plt.title('Bar Chart of Average Monthly Expenditure')
plt.xlabel('Index')
plt.ylabel('Average Monthly Expenditure')
plt.grid(True)
plt.show()
from tabulate import tabulate

# Frequency distribution
# Count how often each distinct average value occurs and give the two
# resulting columns readable names.
frequency_table = data['avg_expenditure'].value_counts().reset_index()
frequency_table.columns = ['Average Monthly Expenditure', 'Frequency']

# Print frequency table, first 1000 characters
print(tabulate(frequency_table, headers='keys', tablefmt='pipe')[:1000])
Jupyter notebook
A Jupyter notebook with this Python code, plots and tables is available in the repository https://github.com/WiktoriaKop/descriptiveStat.git, in the file visualisationReport.
Exercise 2.
Answer with the most appropriate data visualization for the following questions:
- What is the distribution of IMDb scores for Polish movies and series?
# Keep only the rows of `mydata` whose Tags contain "Polish"; the histogram
# and density plots of IMDb scores are drawn from this subset.
# The former trailing group_by(Series.or.Movie) was removed: ggplot2 does not
# use dplyr groups (the plots split by type with facet_wrap()), so it only
# left a grouped data frame behind.
Imdb_scores_distribution <- mydata %>%
  filter(grepl("Polish", Tags))
# Histogram of IMDb scores (bin width 0.1), one panel per value of
# Series.or.Movie, with each bar filled according to its own count.
# Only one fill scale is kept: the earlier scale_fill_gradient() call was
# replaced by scale_fill_distiller() anyway and merely triggered the
# "Scale for fill is already present" message.
ggplot(Imdb_scores_distribution, aes(x = IMDb.Score, fill = after_stat(count))) +
  geom_histogram(binwidth = 0.1) +
  scale_fill_distiller(direction = -1, palette = 4) +
  scale_y_continuous(breaks = seq(0, 8, by = 1)) +
  scale_x_continuous(breaks = seq(3, 9, by = 0.5)) +
  facet_wrap(vars(Series.or.Movie)) +
  labs(
    title = "Distribution of IMDb Scores",
    y = "frequency",
    x = "IMDb Score (0 - 10)"
  ) +
  theme_dark()
- What is the density function of IMDb scores for Polish movies and series?
# Density curve of IMDb scores for the whole Polish subset (no split by type).
# `linewidth` is used for the line thickness because `size` was deprecated for
# lines in ggplot2 3.4.0 and produced a deprecation warning.
ggplot(Imdb_scores_distribution, aes(x = IMDb.Score)) +
  geom_density(color = "navyblue", linewidth = 0.7) +
  scale_x_continuous(breaks = seq(3, 9, by = 0.5)) +
  scale_y_continuous(breaks = seq(0, 1, by = 0.1)) +
  labs(
    title = "Density of IMDb scores for Polish movies and series together",
    x = "IMDb scores"
  )
# Density curve of IMDb scores, one panel per value of Series.or.Movie.
# `linewidth` is used for the line thickness because `size` was deprecated for
# lines in ggplot2 3.4.0.
ggplot(Imdb_scores_distribution, aes(x = IMDb.Score)) +
  geom_density(color = "navyblue", linewidth = 0.7) +
  scale_x_continuous(breaks = seq(3, 9, by = 0.5)) +
  scale_y_continuous(breaks = seq(0, 10, by = 0.1)) +
  facet_wrap(~Series.or.Movie) +
  labs(
    title = "Density of IMDb scores for Polish movies and series",
    x = "IMDb scores"
  )

- What are the most popular languages available on Netflix?
# Build one row per (title row, language) pair:
#   1. split the comma-separated Languages column into separate rows,
#   2. turn Languages into a factor ordered by frequency, then reversed,
#   3. drop rows whose language is the empty string,
#   4. keep the 30 most common languages and lump the rest together.
language_separated <- mydata %>%
  separate_rows(Languages, sep = ", ") %>%
  mutate(
    Languages = Languages %>%
      fct_infreq() %>%
      fct_rev()
  ) %>%
  filter(Languages != "") %>%
  mutate(language_lump = fct_lump(Languages, n = 30))
# Horizontal bar chart counting the rows of `language_separated` per lumped
# language.
ggplot(language_separated, aes(y = language_lump)) +
  geom_bar() +
  scale_x_continuous(breaks = seq(0, 6500, by = 300)) +
  labs(title = "Most popular languages on Netflix", y = "Language")

# Table of the ten most frequent languages.
# slice_max() names the ranking column explicitly and returns the rows in
# descending order; the superseded top_n() had to guess the column
# ("Selecting by count"). `align` now has one entry per table column (both
# centred) instead of twelve.
language_separated %>%
  count(Languages, name = "count") %>%
  slice_max(count, n = 10) %>%
  kbl(align = "cc", caption = "TOP 10 MOST POPULAR LANGUAGES ON NETFLIX") %>%
  kable_styling(bootstrap_options = "striped")
| Languages | count |
|---|---|
| English | 6170 |
| Japanese | 1177 |
| Spanish | 837 |
| French | 801 |
| Korean | 562 |
| German | 489 |
| Hindi | 349 |
| Mandarin | 335 |
| Italian | 312 |
| Russian | 278 |
For extra credits:
Extra challenge 1.: Create a chart showing actors starring in the most popular productions.
# Distinct (Title, IMDb.Votes, Actor) rows, ordered by IMDb.Votes descending.
actors_production <- data2 %>%
  select(Title, IMDb.Votes, Actor) %>%
  arrange(desc(IMDb.Votes)) %>%
  unique()

# The first 100 distinct titles in that ordering.
top_productions <- actors_production %>%
  select(Title) %>%
  unique() %>%
  head(100)

# Rows of actors_production that belong to one of those titles.
actors <- actors_production %>%
  filter(Title %in% top_productions$Title)

# The ten actors with the most rows among those titles.
actors_top <- actors %>%
  group_by(Actor) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  head(10)

Extra challenge 2.: For movies and series, create rating charts from the various portals (Hidden Gem, IMDb, Rotten Tomatoes, Metacritic). Hint: it's a good idea to reshape the data to long format.
# Scores from the four portals for the first ten distinct rows when data2 is
# ordered by IMDb.Votes descending.
data_films <- data2 %>%
  arrange(desc(IMDb.Votes)) %>%
  select(
    Title, Hidden.Gem.Score, IMDb.Score,
    Rotten.Tomatoes.Score, Metacritic.Score
  ) %>%
  unique() %>%
  head(10)

# Reshape to long format: one row per (Title, score column).
# pivot_longer() replaces the superseded gather(). The output column names are
# unchanged: `Score` holds the name of the source column and `Portals` holds
# the value. Only the row order differs (rows are emitted title by title
# rather than column by column), which does not affect the chart.
data_films_long <- data_films %>%
  pivot_longer(-Title, names_to = "Score", values_to = "Portals")

# One panel per score column, one bar per title.
ggplot(data_films_long, aes(y = Title, x = Portals)) +
  geom_col(position = "dodge", fill = "orange4") +
  facet_wrap(~ Score, scales = "free") +
  labs(
    title = "Top 10 Productions Scores by Website",
    x = "",
    y = "Title"
  )

Extra challenge 3.: Which film studios produce the most and how has this changed over the years?
# Distinct (Title, Production.House, Release.Date) rows in which neither the
# production house nor the release date is an empty string.
studios <- data2 %>%
  select(Title, Production.House, Release.Date) %>%
  filter(Production.House != "", Release.Date != "") %>%
  unique()

# The four production houses with the most rows in `studios`.
studios_top <- studios %>%
  group_by(Production.House) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(n = 4)
# Number of rows per (Production.House, Release.Date) for the top studios only.
# `.groups = "drop"` returns an ungrouped result; without it summarize() keeps
# the data grouped by Production.House and prints a regrouping message.
# NOTE(review): counts are per distinct Release.Date value; if that column
# holds full dates rather than years, the year would have to be extracted
# first to show the change "over the years" - confirm.
studio_new <- studios %>%
  filter(Production.House %in% studios_top$Production.House) %>%
  group_by(Production.House, Release.Date) %>%
  summarize(count = n(), .groups = "drop")