EXERCISE 1
# Load the CreditCard data and derive expenditure per month. Rows with
# months == 0 are filtered out before the division, so it never divides by 0.
data(CreditCard)
CreditCard <- CreditCard %>%
  filter(months != 0) %>%
  mutate(exp_per_mth = expenditure / months)
#diagnose_numeric(CreditCard)

# Binned frequency table of the derived variable.
tabela4 <- Freq(CreditCard$exp_per_mth, useNA = "ifany")

# Render the table as a striped, non-full-width HTML table.
# NOTE(review): the columns headed "%" render as proportions (e.g. 0.9354103),
# not percentages -- confirm whether they should be scaled by 100 or the
# headers reworded.
tabela4 %>%
  kable(
    col.names = c(
      "Avg exp in kUSD",
      "Frequency",
      "Percentage %",
      "Cumulative frequency",
      "Cumulative percentage %"
    )
  ) %>%
  kable_styling(bootstrap_options = "striped") %>%
  # FALSE rather than F: F is an ordinary variable that can be reassigned.
  kable_classic(full_width = FALSE, html_font = "Arial")
| Avg exp in kUSD | Frequency | Percentage % | Cumulative frequency | Cumulative percentage % |
|---|---|---|---|---|
| [0,50] | 1231 | 0.9354103 | 1231 | 0.9354103 |
| (50,100] | 44 | 0.0334347 | 1275 | 0.9688450 |
| (100,150] | 14 | 0.0106383 | 1289 | 0.9794833 |
| (150,200] | 12 | 0.0091185 | 1301 | 0.9886018 |
| (200,250] | 8 | 0.0060790 | 1309 | 0.9946809 |
| (250,300] | 1 | 0.0007599 | 1310 | 0.9954407 |
| (300,350] | 0 | 0.0000000 | 1310 | 0.9954407 |
| (350,400] | 1 | 0.0007599 | 1311 | 0.9962006 |
| (400,450] | 1 | 0.0007599 | 1312 | 0.9969605 |
| (450,500] | 4 | 0.0030395 | 1316 | 1.0000000 |
EXERCISE 2 - DISTRIBUTION
# Download the Netflix catalogue used by all of the Exercise 2 plots.
netflix_data = pd.read_csv("https://raw.githubusercontent.com/kflisikowski/ds/master/netflix-dataset.csv?raw=true")

# Row masks: language field equal to "Polish", and the two title types.
# NOTE(review): the equality test only matches titles whose Languages value is
# exactly "Polish"; multi-language entries are not selected -- confirm intent.
is_polish = netflix_data["Languages"] == "Polish"
is_movie = netflix_data["Series or Movie"] == "Movie"
is_series = netflix_data["Series or Movie"] == "Series"

# Select each subset and discard rows without an IMDb score in one step.
polish_movies = netflix_data[is_polish & is_movie].dropna(subset=["IMDb Score"])
polish_series = netflix_data[is_polish & is_series].dropna(subset=["IMDb Score"])

# Overlay the two score histograms (movies first, series on top) on one figure.
movie_scores = polish_movies["IMDb Score"]
series_scores = polish_series["IMDb Score"]
plt.figure()
plt.hist(movie_scores, bins=25, color='skyblue', alpha=0.5, label='Polish Movies')
plt.hist(series_scores, bins=25, color='red', alpha=0.2, label='Polish Series')

# Axis decoration; the legend picks up the labels passed to hist() above.
plt.xlabel('IMDb Score')
plt.ylabel('Amount')
plt.title('Distribution of IMDb Scores for Polish Movies and Series')
plt.legend()
plt.grid(True)

# Place y-axis ticks at the integers 0 through 4.
plt.yticks(range(0, 5, 1))
## ([<matplotlib.axis.YTick object at 0x0000022AD55DFD00>, <matplotlib.axis.YTick object at 0x0000022AD55DF580>, <matplotlib.axis.YTick object at 0x0000022AD55CD400>, <matplotlib.axis.YTick object at 0x0000022AD425A040>, <matplotlib.axis.YTick object at 0x0000022AD425DAC0>], [Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, '')])
plt.show()

EXERCISE 2 - DENSITY
# Kernel density estimates of IMDb scores for the Polish movie and series
# subsets, drawn as filled curves on a single figure.
plt.figure()
# fill=True replaces the deprecated shade=True keyword of seaborn.kdeplot.
sns.kdeplot(polish_movies["IMDb Score"], color='skyblue', label='Polish Movies', fill=True)
sns.kdeplot(polish_series["IMDb Score"], color='pink', label='Polish Series', fill=True)
plt.title('Kernel Density Estimation of IMDb Scores for Polish Movies and Series')
plt.xlabel('IMDb Score')
plt.ylabel('Density')
# The legend uses the label= values passed to kdeplot above.
plt.legend()
plt.show()

EXERCISE 2 - LANGUAGES POPULARITY
# Break each title's ", "-separated Languages value into a list, then flatten
# so that every (title, language) pair becomes its own entry.
per_title_languages = netflix_data['Languages'].str.split(', ')
languages = per_title_languages.explode()

# Count how often each language occurs and keep the first ten entries of the
# resulting counts.
language_counts = languages.value_counts()
top_languages = language_counts.head(10)

# Bar chart of those ten languages.
plt.figure()
top_languages.plot.bar(color='red')
plt.xlabel('Languages')
plt.ylabel('Availability')
plt.title('Most Popular Languages on Netflix')
# Slant the language names so they do not overlap.
plt.xticks(rotation=45, ha='right')
## (array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), [Text(0, 0, 'English'), Text(1, 0, 'Japanese'), Text(2, 0, 'Spanish'), Text(3, 0, 'French'), Text(4, 0, 'Korean'), Text(5, 0, 'German'), Text(6, 0, 'Hindi'), Text(7, 0, 'Mandarin'), Text(8, 0, 'Italian'), Text(9, 0, 'Russian')])
plt.tight_layout()
plt.show()
