EXERCISE 1

# Load the CreditCard data set and derive the expenditure per month.
data(CreditCard)

# Rows with months == 0 are removed before dividing, so exp_per_mth is never
# the result of a division by zero.
CreditCard <- CreditCard %>%
  filter(months != 0) %>%
  mutate(exp_per_mth = expenditure / months)

#diagnose_numeric(CreditCard)

# Frequency table of the per-month expenditure computed above
# (Freq is presumably DescTools::Freq -- confirm against the setup chunk).
tabela4 <- Freq(CreditCard$exp_per_mth, useNA = "ifany")

# Render the table as a striped, non-full-width HTML table with readable
# column headers.
tabela4 %>%
  kable(
    col.names = c(
      "Avg exp in kUSD",
      "Frequency",
      "Percentage %",
      "Cumulative frequency",
      "Cumulative percentage %"
    )
  ) %>%
  kable_styling(bootstrap_options = "striped") %>%
  # FALSE is spelled out: the shorthand F is an ordinary variable that can be
  # reassigned, TRUE/FALSE are reserved words.
  kable_classic(full_width = FALSE, html_font = "Arial")
Avg exp in kUSD Frequency Percentage % Cumulative frequency Cumulative percentage %
[0,50] 1231 0.9354103 1231 0.9354103
(50,100] 44 0.0334347 1275 0.9688450
(100,150] 14 0.0106383 1289 0.9794833
(150,200] 12 0.0091185 1301 0.9886018
(200,250] 8 0.0060790 1309 0.9946809
(250,300] 1 0.0007599 1310 0.9954407
(300,350] 0 0.0000000 1310 0.9954407
(350,400] 1 0.0007599 1311 0.9962006
(400,450] 1 0.0007599 1312 0.9969605
(450,500] 4 0.0030395 1316 1.0000000

EXERCISE 2 - DISTRIBUTION

# Download the Netflix catalogue; the chunks below all read from netflix_data.
netflix_data = pd.read_csv("https://raw.githubusercontent.com/kflisikowski/ds/master/netflix-dataset.csv?raw=true")

# Select titles whose "Languages" value is exactly "Polish" (a cell that lists
# Polish together with other languages does not match), split them by title
# type, and discard rows that have no IMDb score.
only_polish = netflix_data["Languages"] == "Polish"
title_type = netflix_data["Series or Movie"]
polish_movies = netflix_data[only_polish & (title_type == "Movie")].dropna(subset=["IMDb Score"])
polish_series = netflix_data[only_polish & (title_type == "Series")].dropna(subset=["IMDb Score"])

# Overlaid, semi-transparent histograms of the IMDb scores of Polish movies
# and Polish series, both drawn with 25 bins.
movie_scores = polish_movies["IMDb Score"]
series_scores = polish_series["IMDb Score"]
shared_hist_args = dict(bins=25)

plt.figure()
plt.hist(movie_scores, color='skyblue', alpha=0.5, label='Polish Movies', **shared_hist_args)
plt.hist(series_scores, color='red', alpha=0.2, label='Polish Series', **shared_hist_args)
plt.xlabel('IMDb Score')
plt.ylabel('Amount')
plt.title('Distribution of IMDb Scores for Polish Movies and Series')
plt.legend()
plt.grid(True)
# Pin the y-axis ticks to the integers 0..4.
plt.yticks(range(0, 5, 1))
## ([<matplotlib.axis.YTick object at 0x0000022AD55DFD00>, <matplotlib.axis.YTick object at 0x0000022AD55DF580>, <matplotlib.axis.YTick object at 0x0000022AD55CD400>, <matplotlib.axis.YTick object at 0x0000022AD425A040>, <matplotlib.axis.YTick object at 0x0000022AD425DAC0>], [Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, '')])
plt.show()

EXERCISE 2 - DENSITY

# Kernel density estimates of the IMDb scores of Polish movies and Polish
# series, drawn on one figure with the area under each curve filled.
plt.figure()
# fill=True is the supported spelling; shade=True is a deprecated alias that
# newer seaborn releases warn about and plan to remove.
sns.kdeplot(polish_movies["IMDb Score"], color='skyblue', label='Polish Movies', fill=True)
sns.kdeplot(polish_series["IMDb Score"], color='pink', label='Polish Series', fill=True)
plt.title('Kernel Density Estimation of IMDb Scores for Polish Movies and Series')
plt.xlabel('IMDb Score')
plt.ylabel('Density')
plt.legend()
plt.show()

EXERCISE 2 - LANGUAGES POPULARITY

# Count how often each language appears in the catalogue and chart the ten
# most frequent ones. A "Languages" cell is split on ', ' first, so every
# language named in a cell is counted separately.
language_lists = netflix_data['Languages'].str.split(', ')
languages = language_lists.explode()
language_counts = languages.value_counts()
# value_counts() returns the counts in descending order, so the first ten
# entries are the ten most frequent languages.
top_languages = language_counts.iloc[:10]

plt.figure()
top_languages.plot.bar(color='red')
plt.xlabel('Languages')
plt.ylabel('Availability')
plt.title('Most Popular Languages on Netflix')
plt.xticks(rotation=45, ha='right')
## (array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), [Text(0, 0, 'English'), Text(1, 0, 'Japanese'), Text(2, 0, 'Spanish'), Text(3, 0, 'French'), Text(4, 0, 'Korean'), Text(5, 0, 'German'), Text(6, 0, 'Hindi'), Text(7, 0, 'Mandarin'), Text(8, 0, 'Italian'), Text(9, 0, 'Russian')])
plt.tight_layout()
plt.show()

EXTRA CHALLENGE 2 - RATING CHARTS

# Average each of the four rating columns separately for movies and for
# series, then plot both sets of averages as lines over the score type.
score_columns = ["Hidden Gem Score", "IMDb Score", "Rotten Tomatoes Score", "Metacritic Score"]
title_kind = netflix_data["Series or Movie"]
movies = netflix_data[title_kind == "Movie"]
series = netflix_data[title_kind == "Series"]
movies_avg_score = movies[score_columns].mean()
series_avg_score = series[score_columns].mean()

# The Hidden Gem and IMDb averages are multiplied by 10 so they can be compared
# with the other two scores (presumably 0-10 vs 0-100 scales -- confirm
# against the data).
for rescaled_column in ("Hidden Gem Score", "IMDb Score"):
    movies_avg_score[rescaled_column] *= 10
    series_avg_score[rescaled_column] *= 10

line_style = dict(marker='o', linestyle='-')

plt.figure(figsize=(10, 6))
plt.plot(movies_avg_score.index, movies_avg_score, color='blue', label='Movies', **line_style)
plt.plot(series_avg_score.index, series_avg_score, color='red', label='Series', **line_style)
plt.xticks(rotation=45)
## ([0, 1, 2, 3], [Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, '')])
plt.title('Average Scores for Movies and Series')
plt.xlabel('Score Type')
plt.ylabel('Average Score')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

EXTRA CHALLENGE 3 - MOST PRODUCTIVE STUDIO

# Find the production house with the most catalogue entries and count its
# titles per release year.
productive_studio = netflix_data['Production House'].value_counts().idxmax()

# .copy() makes the selection an independent frame; without it the column
# assignments below write into a slice of netflix_data, which pandas reports
# as SettingWithCopyWarning and may not apply reliably.
productive_studio_data = netflix_data[netflix_data['Production House'] == productive_studio].copy()

# Dates that do not match day/month/year become NaT (errors='coerce'); their
# Year is then missing, and groupby leaves those rows out of the counts.
productive_studio_data['Release Date'] = pd.to_datetime(productive_studio_data['Release Date'], format='%d/%m/%Y', errors='coerce')
productive_studio_data['Year'] = productive_studio_data['Release Date'].dt.year
production_count_by_year = productive_studio_data.groupby('Year').size()

# Line chart of the number of titles per release year for the studio found
# above; the studio name is prepended to the chart title.
plt.figure(figsize=(10, 6))
production_count_by_year.plot.line(marker='o', color='red', linestyle='-')
plt.xlabel('Year')
plt.ylabel('Productions amount')
plt.title(productive_studio + ' - amount of productions over the years')
plt.grid(True)
plt.show()