EXERCISE 1
# Load the CreditCard data set into the session.
data(CreditCard)

# Divide expenditure by months for every row; rows with zero months are
# removed first so the division never hits a zero denominator.
monthly_average <- mutate(
  filter(CreditCard, months != 0),
  AverageMonthlyExpenditure = expenditure / months
)

# Keep the ten rows with the largest average, highest first.
top_ten_rows <- monthly_average %>%
  arrange(desc(AverageMonthlyExpenditure)) %>%
  head(10)

# Render the result as a striped, hover-highlighted HTML table.
kable_table <- kable_styling(
  kable(
    top_ten_rows,
    format = "html",
    caption = "Top 10 Average Monthly Credit Card Expenditures"
  ),
  bootstrap_options = c("striped", "hover")
)
kable_table
| | card | reports | age | income | share | expenditure | owner | selfemp | dependents | months | majorcards | active | AverageMonthlyExpenditure |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 472 | yes | 1 | 43.33333 | 4.80 | 0.3706127 | 1482.3680 | yes | no | 0 | 3 | 1 | 4 | 494.1227 |
| 765 | yes | 0 | 44.50000 | 9.00 | 0.2599927 | 1949.8620 | no | no | 2 | 4 | 1 | 7 | 487.4655 |
| 624 | yes | 0 | 27.16667 | 2.60 | 0.8778461 | 1902.0000 | no | no | 0 | 4 | 1 | 7 | 475.5000 |
| 1292 | yes | 0 | 22.00000 | 1.75 | 0.6214691 | 906.3092 | no | no | 0 | 2 | 1 | 0 | 453.1546 |
| 1181 | yes | 0 | 23.16667 | 2.60 | 0.8245508 | 1786.2770 | no | no | 0 | 4 | 1 | 0 | 446.5693 |
| 874 | yes | 0 | 25.58333 | 2.80 | 0.3252597 | 758.9391 | no | no | 0 | 2 | 1 | 7 | 379.4696 |
| 434 | yes | 0 | 28.91667 | 2.75 | 0.3822785 | 875.8884 | yes | no | 1 | 3 | 1 | 3 | 291.9628 |
| 487 | yes | 0 | 22.91667 | 2.50 | 0.2395248 | 498.9267 | no | no | 0 | 2 | 1 | 0 | 249.4633 |
| 763 | yes | 0 | 35.25000 | 6.00 | 0.4582348 | 2291.1740 | no | no | 2 | 10 | 1 | 0 | 229.1174 |
| 113 | yes | 0 | 28.16667 | 2.40 | 0.1137675 | 227.3683 | no | no | 0 | 1 | 1 | 29 | 227.3683 |
EXERCISE 2.1
# Download the Netflix catalogue; the later exercises all read from `data`.
data <- read.csv(
  "https://raw.githubusercontent.com/kflisikowski/ds/master/netflix-dataset.csv?raw=true"
)

# Keep only the rows whose Tags field mentions "Polish".
polish_films <- data[grepl("Polish", data$Tags), ]

# Histogram of IMDb scores for those titles, in bins 0.1 wide.
ggplot(polish_films, aes(x = IMDb.Score)) +
  geom_histogram(binwidth = 0.1, colour = "black", fill = "red") +
  ggtitle("Distribution of Imdb scores for Polish movies and movies-series") +
  xlab("Imdb Score") +
  ylab("Quantity")
EXERCISE 2.2
# Same IMDb scores as a smoothed density curve; the fill is half-transparent.
ggplot(polish_films, aes(x = IMDb.Score)) +
  geom_density(alpha = 0.5, colour = "black", fill = "red") +
  ggtitle("Distribution of IMDb scores for Polish movies and movie-series") +
  xlab("IMDb Score") +
  ylab("Density")
EXERCISE 2.3
# Count how many titles list each language. A title can list several
# comma-separated languages, so each language is split onto its own row
# before counting.
language_counts <- data %>%
  separate_rows(Languages, sep = ",\\s*") %>%
  # Titles with no language information would otherwise be tallied as a
  # "language" of their own and could end up in the chart.
  filter(!is.na(Languages), trimws(Languages) != "") %>%
  count(Languages, name = "Count") %>%
  arrange(desc(Count))

# head() returns at most ten rows and never pads with NA rows, unlike
# indexing with 1:10 when fewer than ten languages are present.
top_languages <- head(language_counts, 10)

# Bar chart of the ten most frequent languages, ordered by count.
ggplot(top_languages, aes(x = reorder(Languages, Count), y = Count)) +
  geom_bar(stat = "identity", fill = "black") +
  labs(
    title = "Top 10 Most Popular Languages on Netflix",
    x = "Languages",
    y = "Number of movies"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
CHALLENGE 1
# Give every actor in a comma-separated Actors field a row of their own.
seperated_actor_popularity <- separate_rows(data, Actors, sep = ",\\s*")

# Total the award nominations of every title each actor appears in and rank
# the actors by that total, highest first.
actor_popularity <- seperated_actor_popularity %>%
  group_by(Actors) %>%
  summarise(TotalPopularity = sum(Awards.Nominated.For, na.rm = TRUE)) %>%
  arrange(desc(TotalPopularity))

# The ten highest-ranked actors.
top_actors <- actor_popularity %>% head(10)

# Horizontal bars, one colour per actor; the legend would only repeat the
# axis labels, so it is hidden.
ggplot(
  top_actors,
  aes(x = reorder(Actors, TotalPopularity), y = TotalPopularity, fill = Actors)
) +
  geom_col() +
  coord_flip() +
  ggtitle("Actors starring in the most popular productions") +
  xlab("Actor") +
  ylab("movie's popularity") +
  theme_minimal() +
  theme(legend.position = "none")
CHALLENGE 2
# First graph: the two scores plotted on the 1-to-100 scale.
# Stack both score columns into one long Score column, tagged by source.
long_data1to100 <- pivot_longer(
  data,
  cols = c("Rotten.Tomatoes.Score", "Metacritic.Score"),
  names_to = "Rating_Source_From_1_to_100",
  values_to = "Score"
)
# Keep only rows that carry a finite score.
long_data1to100 <- filter(long_data1to100, is.finite(Score))

ggplot(
  long_data1to100,
  aes(
    x = Rating_Source_From_1_to_100,
    y = Score,
    fill = Rating_Source_From_1_to_100
  )
) +
  geom_boxplot() +
  scale_fill_manual(
    values = c("Rotten.Tomatoes.Score" = "red", "Metacritic.Score" = "purple")
  ) +
  ggtitle("Rating charts from the various portals scale from 1 to 100") +
  xlab("Portal") +
  ylab("Rate") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Second graph: the two scores plotted on the 1-to-10 scale.
long_data1to10 <- pivot_longer(
  data,
  cols = c("IMDb.Score", "Hidden.Gem.Score"),
  names_to = "Rating_Source_From_1_to_10",
  values_to = "Score"
)
# Keep only rows that carry a finite score.
long_data1to10 <- filter(long_data1to10, is.finite(Score))

ggplot(
  long_data1to10,
  aes(
    x = Rating_Source_From_1_to_10,
    y = Score,
    fill = Rating_Source_From_1_to_10
  )
) +
  geom_boxplot() +
  scale_fill_manual(
    values = c("IMDb.Score" = "yellow", "Hidden.Gem.Score" = "black")
  ) +
  ggtitle("Rating charts from the various portals scale from 1 to 10") +
  xlab("Portal") +
  ylab("Rate") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
CHALLENGE 3 - Which studio produced the most films?
# Count how many titles each production house has, most productive first.
studio_counts <- data %>%
  # Rows with no production house recorded would otherwise be counted as
  # one big unnamed "studio" and could take the top spot in the ranking.
  filter(!is.na(Production.House), trimws(Production.House) != "") %>%
  count(Production.House) %>%
  arrange(desc(n))

# The ten production houses with the most titles.
top_ten_studios <- head(studio_counts, 10)

# Horizontal bar chart of the top ten.
ggplot(top_ten_studios, aes(x = reorder(Production.House, -n), y = n)) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(
    title = "10 Film studios that produced the most films",
    x = "Film studio",
    y = "Number of productions"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_flip()
CHALLENGE 3 - How has it changed over the years? * Some records had missing data and were left out of the plot
# Drop rows without a release date, then derive the release year as text.
data <- data[complete.cases(data$Release.Date), ]
data$Year <- format(as.Date(data$Release.Date, format = "%d/%m/%Y"), "%Y")
# Dates that do not parse as day/month/year give an NA year; drop those too.
data <- data[!is.na(data$Year), ]

# Number of titles per year whose production house is exactly "Netflix".
netflix_counts <- data %>%
  filter(Production.House == "Netflix") %>%
  count(Year) %>%
  # Year is a character string here. On a discrete x axis geom_line() puts
  # every year in its own group and draws no line, so the year is converted
  # to a number for plotting. This also spaces years with no titles correctly.
  mutate(Year = as.integer(Year)) %>%
  arrange(Year)

ggplot(netflix_counts, aes(x = Year, y = n)) +
  geom_line(color = "blue") +
  geom_point(color = "blue") +
  # Label exactly the years that have data, so no fractional years appear.
  scale_x_continuous(breaks = netflix_counts$Year) +
  labs(
    title = "Films produced by Netflix over the years",
    x = "Year",
    y = "Number of films produced"
  ) +
  theme_minimal()