Krátka analýza zameraná iba na hráčov s rankom 1
library(tidyverse)
library(haven)
library(lubridate)
library(janitor)
library(stringr)
# 1) načítaj dáta
path <- c(params$data_path, "./all.dta", "/mnt/data/all.dta") %>% purrr::keep(file.exists) %>% dplyr::first()
stopifnot(!is.null(path))
raw <- read_dta(path)
df <- labelled::to_factor(raw) %>% clean_names()
# 2) autodetekcia mien stĺpcov
nm <- names(df)
# hráč
guess_player <- c(params$player_var, nm[str_detect(nm, "player|hrac|nick|name|id")]) %>% purrr::discard(is.null) %>% dplyr::first()
# rank
guess_rank <- c(params$rank_var, nm[str_detect(nm, "(^|_)rank($|_)|rating_rank|ranking")]) %>% purrr::discard(is.null) %>% dplyr::first()
# dátum
date_candidates <- nm[vapply(df, inherits, logical(1), what = c("Date","POSIXct","POSIXt"))]
if (length(date_candidates) == 0) {
likely_date_names <- nm[str_detect(nm, "date|datum|day|time|datetime|match_date|played|created")]
for (v in likely_date_names){
dt <- suppressWarnings(ymd(df[[v]]))
if (sum(!is.na(dt)) > nrow(df)*0.2) { df[[v]] <- dt; date_candidates <- c(date_candidates, v); break }
}
}
guess_date <- dplyr::first(purrr::discard(c(params$date_var, date_candidates), is.null))
# sets won / lost (autodetekcia) – v datasete mám len swon
guess_sw <- dplyr::first(purrr::discard(c(
params$sets_won_var,
nm[stringr::str_detect(nm, "(^|_)(swon|sets?_won|won_sets|set_won|setsfor|sets_for)($|_)")]
), is.null))
# sets_lost neexistuje – vypočítame neskôr pomocou swon → lost
guess_sl <- c(
params$sets_lost_var,
nm[str_detect(nm, "(^|_)(sets?_lost|lost_sets|set_lost|sl|setsagainst|sets_against)($|_)")]
) %>% purrr::discard(is.null) %>% dplyr::first()
# 3) validácia a zostavenie core dát
if (is.null(guess_player) || is.null(guess_rank)) {
stop("Nepodarilo sa nájsť stĺpce player/rank. Zadaj mená do params$player_var a params$rank_var.")
}
# vyber povinné stĺpce
core <- df %>% dplyr::select(player = dplyr::all_of(guess_player), rank = dplyr::all_of(guess_rank))
# doplň date stĺpec — robustne, aj keď sa nenašiel
if (!is.null(guess_date) && guess_date %in% names(df)) {
if (inherits(df[[guess_date]], "Date")) {
core$date <- df[[guess_date]]
} else {
core$date <- suppressWarnings(as.Date(df[[guess_date]]))
}
} else {
core$date <- seq_len(nrow(core)) # náhradný „čas“ podľa poradia riadkov
}
# doplň sets_won (povinné) a z neho dopočítaj sets_lost podľa pravidla
if (!is.null(guess_sw) && guess_sw %in% names(df)) {
core$sets_won <- suppressWarnings(readr::parse_number(as.character(df[[guess_sw]])))
} else {
stop("Nenašiel som stĺpec s vyhratými setmi (swon). Nastav params$sets_won_var = 'swon'.")
}
# Pravidlo: swon == 0 alebo 1 → prehral 2 sety; swon >= 2 → prehral 0
core$sets_lost <- dplyr::case_when(
is.na(core$sets_won) ~ NA_real_,
core$sets_won %in% c(0,1) ~ 2,
core$sets_won >= 2 ~ 0,
TRUE ~ NA_real_
)
# typy/ranky/filtre
if (!is.numeric(core$rank)) core <- core %>% mutate(rank = readr::parse_number(as.character(rank)))
core <- core %>% filter(!is.na(player), !is.na(rank)) %>% arrange(player, date) %>% mutate(is_r1 = rank == 1)
# info
list(
n_rows = nrow(core), n_players = n_distinct(core$player),
vars = list(player = guess_player, rank = guess_rank, date = guess_date)
)
## $n_rows
## [1] 111941
##
## $n_players
## [1] 1999
##
## $vars
## $vars$player
## [1] "player"
##
## $vars$rank
## [1] "rank"
##
## $vars$date
## [1] NA
players_total <- n_distinct(core$player)
players_r1 <- core %>% filter(is_r1) %>% distinct(player) %>% nrow()
share_r1 <- players_r1 / players_total
p1 <- tibble(metric = c("hráči s rank1", "ostatní"),
value = c(players_r1, players_total - players_r1)) %>%
ggplot(aes(x = metric, y = value)) +
geom_col() +
geom_text(aes(label = value), vjust = -0.3) +
labs(title = "Počet hráčov, ktorí niekedy dosiahli rank 1",
x = NULL, y = "Počet hráčov") +
theme_minimal()
print(p1)
Interpretácia: Stĺpec „hráči s rank1“ ukazuje, koľko hráčov sa aspoň raz dostalo na prvé miesto rebríčku. Druhý stĺpec zobrazuje všetkých ostatných.
p2 <- tibble(group = c("Rank 1 niekedy", "Nikdy"), share = c(share_r1, 1 - share_r1)) %>%
ggplot(aes(x = group, y = share)) +
geom_col() +
scale_y_continuous(labels = scales::percent) +
geom_text(aes(label = scales::percent(share, accuracy = 0.1)), vjust = -0.3) +
labs(title = "Podiel hráčov, ktorí niekedy dosiahli rank 1",
x = NULL, y = "% hráčov") +
theme_minimal()
print(p2)
Interpretácia: Výška stĺpca vyjadruje, koľko % všetkých hráčov získalo rank 1 aspoň raz.
r1_sets <- core %>% filter(is_r1) %>%
group_by(player) %>%
summarise(sets_won = sum(sets_won, na.rm = TRUE),
sets_lost = sum(sets_lost, na.rm = TRUE), .groups = "drop")
long_r1 <- r1_sets %>% pivot_longer(c(sets_won, sets_lost), names_to = "type", values_to = "sets")
p3 <- long_r1 %>%
group_by(type) %>% summarise(total = sum(sets), .groups = "drop") %>%
ggplot(aes(x = type, y = total)) +
geom_col() +
geom_text(aes(label = total), vjust = -0.3) +
labs(title = "Súčet vyhratých a prehratých setov počas ranku 1 (všetci hráči)",
subtitle = "Pravidlo: swon 0/1 ⇒ prehrané 2 sety; swon ≥ 2 ⇒ prehrané 0",
x = NULL, y = "Počet setov") +
theme_minimal()
print(p3)
Interpretácia: Vidíme celkový „objem“ setov, ktoré hráči odohrali v tom období, keď mali rank 1, rozdelené na výhry a prehry.
r1_leaderboard <- r1_sets %>% mutate(net_sets = sets_won - sets_lost,
win_rate = sets_won/pmax(1, (sets_won + sets_lost)))
p4 <- r1_leaderboard %>%
slice_max(net_sets, n = 10, with_ties = FALSE) %>%
mutate(player = forcats::fct_reorder(player, net_sets)) %>%
ggplot(aes(x = player, y = net_sets)) +
geom_col() + coord_flip() +
labs(title = "Top 10 hráčov podľa čistého skóre setov (rank 1)",
subtitle = "net_sets = sets_won − sets_lost",
x = NULL, y = "Čisté sety") +
theme_minimal()
print(p4)
Interpretácia: Rebríček zvýrazňuje, kto mal počas ranku 1 najvýraznejšiu dominanciu v setoch (čisté skóre).