Tables & Plots Report

EXERCISE 1

# Exercise 1: frequency table of credit-card expenditure per month.
#
# Load the CreditCard data set, drop rows where `months` is 0 (so the
# division below never divides by zero) and derive `exp_per_mth`.
# NOTE(review): the AER documentation describes `expenditure` as an average
# monthly figure and `months` as months at the current address -- confirm
# that this ratio and the "kUSD" label used below are what is intended.
data(CreditCard)
CreditCard <- CreditCard %>%
  filter(months != 0) %>%
  mutate(exp_per_mth = expenditure / months)

#diagnose_numeric(CreditCard)

# Binned frequency table of the derived variable (Freq() is presumably
# DescTools::Freq -- verify against the setup chunk).
tabela4 <- Freq(CreditCard$exp_per_mth, useNA = "ifany")

# Render the table as styled HTML.
# NOTE(review): the "Percentage" columns are printed as proportions (0-1),
# not percentages -- confirm whether they should be multiplied by 100.
tabela4 %>%
  kable(
    col.names = c(
      "Avg exp in kUSD",
      "Frequency",
      "Percentage %",
      "Cumulative frequency",
      "Cumulative percentage %"
    )
  ) %>%
  kable_styling(bootstrap_options = "striped") %>%
  # Spell out FALSE: the shorthand `F` is an ordinary variable that can be
  # reassigned elsewhere in the session.
  kable_classic(full_width = FALSE, html_font = "Arial")

Avg exp in kUSD	Frequency	Percentage %	Cumulative frequency	Cumulative percentage %
[0,50]	1231	0.9354103	1231	0.9354103
(50,100]	44	0.0334347	1275	0.9688450
(100,150]	14	0.0106383	1289	0.9794833
(150,200]	12	0.0091185	1301	0.9886018
(200,250]	8	0.0060790	1309	0.9946809
(250,300]	1	0.0007599	1310	0.9954407
(300,350]	0	0.0000000	1310	0.9954407
(350,400]	1	0.0007599	1311	0.9962006
(400,450]	1	0.0007599	1312	0.9969605
(450,500]	4	0.0030395	1316	1.0000000

EXERCISE 2 - DISTRIBUTION

# Exercise 2 (distribution): overlaid histograms of IMDb scores for
# Polish-language movies and series.

# Download the Netflix catalogue used by every later section of the report.
netflix_data = pd.read_csv("https://raw.githubusercontent.com/kflisikowski/ds/master/netflix-dataset.csv?raw=true")

# Row masks: the Languages field must equal "Polish" exactly, so titles that
# list Polish alongside other languages are not selected.
is_polish = netflix_data["Languages"] == "Polish"
is_movie = netflix_data["Series or Movie"] == "Movie"
is_series = netflix_data["Series or Movie"] == "Series"

# Keep only rows that actually have an IMDb score.
polish_movies = netflix_data[is_polish & is_movie].dropna(subset=["IMDb Score"])
polish_series = netflix_data[is_polish & is_series].dropna(subset=["IMDb Score"])

# Draw both histograms on the same axes; the series layer is more
# transparent so the movie bars stay visible underneath.
plt.figure()
for scores, colour, opacity, legend_name in (
    (polish_movies["IMDb Score"], 'skyblue', 0.5, 'Polish Movies'),
    (polish_series["IMDb Score"], 'red', 0.2, 'Polish Series'),
):
    plt.hist(scores, bins=25, color=colour, alpha=opacity, label=legend_name)
plt.xlabel('IMDb Score')
plt.ylabel('Amount')
plt.title('Distribution of IMDb Scores for Polish Movies and Series')
plt.legend()
plt.grid(True)
# Integer ticks 0..4 on the count axis.
plt.yticks(range(5))

## ([<matplotlib.axis.YTick object at 0x0000022AD55DFD00>, <matplotlib.axis.YTick object at 0x0000022AD55DF580>, <matplotlib.axis.YTick object at 0x0000022AD55CD400>, <matplotlib.axis.YTick object at 0x0000022AD425A040>, <matplotlib.axis.YTick object at 0x0000022AD425DAC0>], [Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, '')])

plt.show()

EXERCISE 2 - DENSITY

# Exercise 2 (density): kernel density estimates of IMDb scores for the
# Polish movies and Polish series selected in the previous section.
plt.figure()
# `fill=True` replaces the deprecated `shade=True` argument of sns.kdeplot;
# it shades the area under each curve in the same way.
sns.kdeplot(polish_movies["IMDb Score"], color='skyblue', label='Polish Movies', fill=True)
sns.kdeplot(polish_series["IMDb Score"], color='pink', label='Polish Series', fill=True)
plt.title('Kernel Density Estimation of IMDb Scores for Polish Movies and Series')
plt.xlabel('IMDb Score')
plt.ylabel('Density')
plt.legend()
plt.show()

EXERCISE 2 - LANGUAGES POPULARITY

# Exercise 2 (languages popularity): bar chart of the ten languages that
# occur most often across all titles.

# A title may list several languages separated by ", "; explode() gives each
# language its own row so that every occurrence is counted.
languages = netflix_data['Languages'].str.split(', ').explode()
language_counts = languages.value_counts()
# value_counts() is sorted by descending count, so the first ten rows are
# the ten most frequent languages.
top_languages = language_counts.iloc[:10]

plt.figure()
top_languages.plot.bar(color='red')
plt.xlabel('Languages')
plt.ylabel('Availability')
plt.title('Most Popular Languages on Netflix')
# Slant the language names and right-align them under their bars.
plt.xticks(rotation=45, ha='right')

## (array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), [Text(0, 0, 'English'), Text(1, 0, 'Japanese'), Text(2, 0, 'Spanish'), Text(3, 0, 'French'), Text(4, 0, 'Korean'), Text(5, 0, 'German'), Text(6, 0, 'Hindi'), Text(7, 0, 'Mandarin'), Text(8, 0, 'Italian'), Text(9, 0, 'Russian')])

plt.tight_layout()
plt.show()

EXTRA CHALLENGE 1 - ACTORS IN MOST POPULAR PRODUCTIONS

# Extra challenge 1: which actors appear most often in the most-voted titles.

# The ten titles with the highest number of IMDb votes. (This variable was
# previously called `top_3_productions` although it has always held ten rows.)
top_productions = netflix_data.sort_values(by='IMDb Votes', ascending=False).head(10)
# One row per credited actor, counted across those titles; keep the six
# actors with the most appearances.
actors_counts = top_productions['Actors'].str.split(', ').explode().value_counts().head(6)
plt.figure(figsize=(10, 6))
actors_counts.plot(kind='bar', color='red')
# Appearances are whole numbers, so force integer ticks from 0 to the maximum.
plt.yticks(range(int(actors_counts.max()) + 1))

## ([<matplotlib.axis.YTick object at 0x0000022AD4EFF160>, <matplotlib.axis.YTick object at 0x0000022AD4D69AF0>, <matplotlib.axis.YTick object at 0x0000022AC862E340>], [Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, '')])

plt.title('Top 6 Actors in Most Popular Productions')
plt.xlabel('Actor')
plt.ylabel('Number of Appearances')
plt.tight_layout()
plt.show()

EXTRA CHALLENGE 2 - RATING CHARTS

# Extra challenge 2: average rating per score type, movies versus series.

score_columns = ["Hidden Gem Score", "IMDb Score", "Rotten Tomatoes Score", "Metacritic Score"]
# Scores that are multiplied by 10 so they can be compared easily with the
# other two.
rescaled_scores = ["Hidden Gem Score", "IMDb Score"]

movies = netflix_data[netflix_data["Series or Movie"] == "Movie"]
series = netflix_data[netflix_data["Series or Movie"] == "Series"]
movies_avg_score = movies[score_columns].mean()
series_avg_score = series[score_columns].mean()

# Apply the x10 rescaling in place to both sets of averages.
for averages in (movies_avg_score, series_avg_score):
    for score_name in rescaled_scores:
        averages[score_name] = averages[score_name] * 10

plt.figure(figsize=(10, 6))
for averages, colour, legend_name in (
    (movies_avg_score, 'blue', 'Movies'),
    (series_avg_score, 'red', 'Series'),
):
    plt.plot(averages.index, averages, marker='o', linestyle='-', color=colour, label=legend_name)
plt.xticks(rotation=45)

## ([0, 1, 2, 3], [Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, '')])

plt.title('Average Scores for Movies and Series')
plt.xlabel('Score Type')
plt.ylabel('Average Score')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

EXTRA CHALLENGE 3 - MOST PRODUCTIVE STUDIO

# Extra challenge 3: yearly output of the production house with the most titles.

# Name of the production house that appears most often in the catalogue.
productive_studio = netflix_data['Production House'].value_counts().idxmax()
# Take an explicit copy: the columns added below would otherwise be written
# onto a filtered slice of `netflix_data`, which raises pandas'
# SettingWithCopyWarning.
productive_studio_data = netflix_data[netflix_data['Production House'] == productive_studio].copy()
# errors='coerce' turns every value that does not match day/month/year into
# NaT instead of raising.
# NOTE(review): confirm the CSV really stores Release Date as dd/mm/YYYY --
# with a different format all rows would become NaT and the chart would be empty.
productive_studio_data['Release Date'] = pd.to_datetime(productive_studio_data['Release Date'], format='%d/%m/%Y', errors='coerce')
productive_studio_data['Year'] = productive_studio_data['Release Date'].dt.year
# Number of titles released per year.
production_count_by_year = productive_studio_data.groupby('Year').size()

plt.figure(figsize=(10, 6))
production_count_by_year.plot(kind='line', marker='o', color='red', linestyle='-')
plt.ylabel('Productions amount')
plt.xlabel('Year')
plt.title(productive_studio + ' - amount of productions over the years')
plt.grid(True)
plt.show()

Tables & Plots Report

Alicja(197772), Hubert(197740), Yaren(201924)

Last edited:

EXERCISE 1

EXERCISE 2 - DISTRIBUTION

EXERCISE 2 - DENSITY

EXERCISE 2 - LANGUAGES POPULARITY

EXTRA CHALLENGE 1 - ACTORS IN MOST POPULAR PRODUCTIONS

EXTRA CHALLENGE 2 - RATING CHARTS

EXTRA CHALLENGE 3 - MOST PRODUCTIVE STUDIO