Documento robusto para RPubs. Si
dfexiste pero no es data frame, se ignora y se lee el Excel.
Nombres de columnas limpiados sin depender dejanitor::make_clean_names(case=...).
req <- c("dplyr","ggplot2","tidyr","scales","stringi","rlang","readxl")
to_install <- setdiff(req, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install, dependencies = TRUE)
library(dplyr)
library(ggplot2)
library(tidyr)
library(scales)
library(stringi)
library(rlang)
library(readxl)
# Función de limpieza de nombres sin janitor (compatible 100%)
clean_names_vec <- function(x) {
x <- as.character(x)
# quitar acentos y caracteres latinos
x <- stringi::stri_trans_general(x, "Latin-ASCII")
# a minúsculas
x <- tolower(x)
# reemplazar no alfanum por _
x <- gsub("[^a-z0-9]+", "_", x, perl = TRUE)
# colapsar múltiples _
x <- gsub("_+", "_", x, perl = TRUE)
# quitar _ al inicio/fin
x <- gsub("^_+|_+$", "", x, perl = TRUE)
x
}
# Determinar origen de df
use_env_df <- exists("df")
if (use_env_df) {
# si existe, confirmar que es data.frame/tibble
if (!is.data.frame(df)) {
message("⚠️ 'df' existe pero no es data.frame; se leerá desde Excel.")
use_env_df <- FALSE
}
}
if (!use_env_df) {
excel_path <- "2016-2019 valores nominales.xlt (1).xlsx"
if (!file.exists(excel_path)) {
stop("No se encontró el archivo Excel. Ajuste 'excel_path' al path correcto.")
}
df <- readxl::read_excel(excel_path, sheet = "datos_originales")
}
# Forzar a data.frame clásico por si vienen clases raras
df <- as.data.frame(df, stringsAsFactors = FALSE)
# Limpiar nombres de columnas
colnames(df) <- clean_names_vec(colnames(df))
# Normalizar año -> anio
if ("ano" %in% names(df)) names(df)[names(df) == "ano"] <- "anio"
if (!"anio" %in% names(df) && "año" %in% names(df)) names(df)[names(df) == "año"] <- "anio"
# Unificar bancos_privatizados si viniera duplicada
if (!"bancos_privatizados" %in% names(df)) {
cand <- grep("^bancos_privatizados", names(df), value = TRUE)
if (length(cand) >= 1) names(df)[names(df) == cand[1]] <- "bancos_privatizados"
}
# Recode de factores útiles (si son 0/1 numéricos)
if ("reestructuracion" %in% names(df) && is.numeric(df$reestructuracion)) {
df$reestructuracion <- factor(df$reestructuracion, levels = c(0,1), labels = c("No","Si"))
}
if ("bancos_privatizados" %in% names(df) && is.numeric(df$bancos_privatizados)) {
df$bancos_privatizados <- factor(df$bancos_privatizados, levels = c(0,1), labels = c("No","Si"))
}
dplyr::glimpse(df)
## Rows: 96
## Columns: 8
## $ provincias <chr> "Buenos Aires", "CABA", "Catamarca", "Chaco", "Chu…
## $ reestructuracion <fct> Si, No, Si, Si, Si, Si, Si, Si, Si, Si, No, Si, Si…
## $ anio <dbl> 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 20…
## $ resultado <dbl> -17181.554432, -10143.663669, 9.304292, -4924.7512…
## $ deudanac <dbl> 185976407.8, 11146195.3, 2495033.4, 12310491.0, 79…
## $ deudaext <dbl> 11020102.17, 31093595.77, 124885.99, 737297.01, 12…
## $ deseq_vertical <dbl> 0.3997638, 0.2306568, 0.8207827, 0.7350382, 0.3692…
## $ bancos_privatizados <fct> No, No, Si, No, No, No, No, Si, Si, Si, No, No, Si…
tabla_freq_cuali <- function(data, var) {
var_quo <- rlang::enquo(var)
data %>%
mutate(.var = !!var_quo) %>%
count(.var, name = "freq") %>%
mutate(
pct = 100 * freq / sum(freq),
freq_acum = cumsum(freq),
pct_acum = cumsum(pct)
) %>%
rename(!!rlang::as_name(var_quo) := .var) %>%
arrange(desc(freq))
}
if ("reestructuracion" %in% names(df)) {
tab_reestr <- tabla_freq_cuali(df, reestructuracion)
tab_reestr
}
if ("bancos_privatizados" %in% names(df)) {
tab_ban <- tabla_freq_cuali(df, bancos_privatizados)
tab_ban
}
if ("provincias" %in% names(df)) {
tab_prov <- df %>%
count(provincias, name = "freq") %>%
mutate(pct = 100*freq/sum(freq)) %>%
arrange(desc(freq))
head(tab_prov, 10)
}
grafico_torta <- function(tabla_cuali, var_lab = "Categoría") {
primero <- names(tabla_cuali)[1]
ggplot(tabla_cuali, aes(x = "", y = pct, fill = .data[[primero]])) +
geom_col(width = 1) +
coord_polar(theta = "y") +
geom_text(aes(label = paste0(round(pct,1),"%")),
position = position_stack(vjust = 0.5), size = 3) +
labs(x = NULL, y = NULL, fill = var_lab, title = paste0("Gráfico de sectores — ", var_lab)) +
theme_void()
}
grafico_barras <- function(tabla_cuali, var_lab = "Categoría") {
primero <- names(tabla_cuali)[1]
ggplot(tabla_cuali, aes(x = .data[[primero]], y = pct)) +
geom_col() +
geom_text(aes(label = paste0(round(pct,1),"%")), vjust = -0.4, size = 3) +
labs(x = var_lab, y = "%", title = paste0("Barras por % — ", var_lab)) +
theme_minimal()
}
if (exists("tab_reestr")) {
grafico_torta(tab_reestr, "Reestructuración")
grafico_barras(tab_reestr, "Reestructuración")
}
if (exists("tab_ban")) {
grafico_torta(tab_ban, "Bancos privatizados")
grafico_barras(tab_ban, "Bancos privatizados")
}
tabla_freq_cuanti <- function(data, var, k = NULL) {
x <- dplyr::pull(data, {{var}})
x <- x[!is.na(x)]
if (length(x) < 2) stop("La variable no tiene suficientes datos numéricos.")
if (is.null(k)) k <- nclass.Sturges(x)
brks <- seq(min(x), max(x), length.out = k + 1)
clase <- cut(x, breaks = brks, include.lowest = TRUE, right = TRUE)
tb <- as.data.frame(table(clase))
names(tb) <- c("clase", "freq")
tb <- tb %>%
mutate(
rel = freq / sum(freq),
pct = 100*rel,
acum = cumsum(freq),
rel_acum = cumsum(rel),
pct_acum = 100*rel_acum
)
lims <- do.call(rbind, strsplit(gsub("\\[|\\]|\\(|\\)", "", as.character(tb$clase)), ",")) %>%
as.data.frame()
names(lims) <- c("li","ls")
lims$li <- as.numeric(lims$li); lims$ls <- as.numeric(lims$ls)
tb <- dplyr::bind_cols(tb, lims) %>%
mutate(pm = (li + ls)/2)
tb
}
resumen_num <- function(x) {
x <- x[!is.na(x)]
stopifnot(is.numeric(x))
n <- length(x)
media <- mean(x); mediana <- median(x)
q <- as.numeric(quantile(x, probs = c(0, .25, .5, .75, 1)))
names(q) <- c("min","Q1","Q2_mediana","Q3","max")
rango <- diff(range(x))
var_ <- var(x); sd_ <- sd(x); iqr_ <- IQR(x)
cv <- ifelse(abs(media) > .Machine$double.eps, sd_/abs(media), NA_real_)
moda <- NA
if (n > 0) {
tab <- sort(table(x), decreasing = TRUE)
moda <- as.numeric(names(tab)[1])
}
tibble::tibble(
n = n, media = media, mediana = mediana,
min = q[1], Q1 = q[2], Q2 = q[3], Q3 = q[4], max = q[5],
rango = rango, varianza = var_, desvio = sd_, IQR = iqr_, CV = cv, moda = moda
)
}
grafico_hist <- function(data, var, bins = NULL) {
x <- dplyr::pull(data, {{var}})
if (!is.numeric(x)) stop("La variable debe ser numérica para histograma.")
if (is.null(bins)) bins <- nclass.Sturges(x)
ggplot(data, aes(x = {{var}})) +
geom_histogram(bins = bins) +
labs(x = rlang::as_name(rlang::enquo(var)), y = "Frecuencia",
title = paste0("Histograma — ", rlang::as_name(rlang::enquo(var)))) +
theme_minimal()
}
vars_num <- c("resultado","deudanac","deudaext","deseq_vertical")
vars_num <- vars_num[vars_num %in% names(df)]
for (v in vars_num) {
cat("\n\n### Variable:", v, "\n")
print(tabla_freq_cuanti(df, !!sym(v)))
print(resumen_num(dplyr::pull(df, !!sym(v))))
print(grafico_hist(df, !!sym(v)))
}
##
##
## ### Variable: resultado
## clase freq rel pct acum rel_acum pct_acum
## 1 [-1.72e+04,-9.83e+03] 5 0.05208333 5.208333 5 0.05208333 5.208333
## 2 (-9.83e+03,-2.48e+03] 24 0.25000000 25.000000 29 0.30208333 30.208333
## 3 (-2.48e+03,4.87e+03] 56 0.58333333 58.333333 85 0.88541667 88.541667
## 4 (4.87e+03,1.22e+04] 7 0.07291667 7.291667 92 0.95833333 95.833333
## 5 (1.22e+04,1.96e+04] 3 0.03125000 3.125000 95 0.98958333 98.958333
## 6 (1.96e+04,2.69e+04] 0 0.00000000 0.000000 95 0.98958333 98.958333
## 7 (2.69e+04,3.43e+04] 0 0.00000000 0.000000 95 0.98958333 98.958333
## 8 (3.43e+04,4.16e+04] 1 0.01041667 1.041667 96 1.00000000 100.000000
## li ls pm
## 1 -17200 -9830 -13515
## 2 -9830 -2480 -6155
## 3 -2480 4870 1195
## 4 4870 12200 8535
## 5 12200 19600 15900
## 6 19600 26900 23250
## 7 26900 34300 30600
## 8 34300 41600 37950
## # A tibble: 1 × 14
## n media mediana min Q1 Q2 Q3 max rango varianza desvio
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 96 -52.7 -284. -17182. -2896. -284. 1587. 41613. 58794. 45059086. 6713.
## # ℹ 3 more variables: IQR <dbl>, CV <dbl>, moda <dbl>
##
##
## ### Variable: deudanac
## clase freq rel pct acum rel_acum pct_acum
## 1 [2.7e+05,3.08e+07] 88 0.91666667 91.666667 88 0.9166667 91.66667
## 2 (3.08e+07,6.14e+07] 3 0.03125000 3.125000 91 0.9479167 94.79167
## 3 (6.14e+07,9.2e+07] 1 0.01041667 1.041667 92 0.9583333 95.83333
## 4 (9.2e+07,1.23e+08] 2 0.02083333 2.083333 94 0.9791667 97.91667
## 5 (1.23e+08,1.53e+08] 0 0.00000000 0.000000 94 0.9791667 97.91667
## 6 (1.53e+08,1.84e+08] 0 0.00000000 0.000000 94 0.9791667 97.91667
## 7 (1.84e+08,2.14e+08] 1 0.01041667 1.041667 95 0.9895833 98.95833
## 8 (2.14e+08,2.45e+08] 1 0.01041667 1.041667 96 1.0000000 100.00000
## li ls pm
## 1 2.70e+05 3.08e+07 15535000
## 2 3.08e+07 6.14e+07 46100000
## 3 6.14e+07 9.20e+07 76700000
## 4 9.20e+07 1.23e+08 107500000
## 5 1.23e+08 1.53e+08 138000000
## 6 1.53e+08 1.84e+08 168500000
## 7 1.84e+08 2.14e+08 199000000
## 8 2.14e+08 2.45e+08 229500000
## # A tibble: 1 × 14
## n media mediana min Q1 Q2 Q3 max rango varianza desvio
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 96 1.65e7 8.12e6 2.70e5 4.80e6 8.12e6 1.27e7 2.45e8 2.44e8 1.15e15 3.39e7
## # ℹ 3 more variables: IQR <dbl>, CV <dbl>, moda <dbl>
##
##
## ### Variable: deudaext
## clase freq rel pct acum rel_acum pct_acum
## 1 [0,6.84e+07] 90 0.93750000 93.750000 90 0.9375000 93.75000
## 2 (6.84e+07,1.37e+08] 3 0.03125000 3.125000 93 0.9687500 96.87500
## 3 (1.37e+08,2.05e+08] 1 0.01041667 1.041667 94 0.9791667 97.91667
## 4 (2.05e+08,2.74e+08] 0 0.00000000 0.000000 94 0.9791667 97.91667
## 5 (2.74e+08,3.42e+08] 0 0.00000000 0.000000 94 0.9791667 97.91667
## 6 (3.42e+08,4.1e+08] 1 0.01041667 1.041667 95 0.9895833 98.95833
## 7 (4.1e+08,4.79e+08] 0 0.00000000 0.000000 95 0.9895833 98.95833
## 8 (4.79e+08,5.47e+08] 1 0.01041667 1.041667 96 1.0000000 100.00000
## li ls pm
## 1 0.00e+00 6.84e+07 34200000
## 2 6.84e+07 1.37e+08 102700000
## 3 1.37e+08 2.05e+08 171000000
## 4 2.05e+08 2.74e+08 239500000
## 5 2.74e+08 3.42e+08 308000000
## 6 3.42e+08 4.10e+08 376000000
## 7 4.10e+08 4.79e+08 444500000
## 8 4.79e+08 5.47e+08 513000000
## # A tibble: 1 × 14
## n media mediana min Q1 Q2 Q3 max rango varianza desvio
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 96 2.33e7 1.86e6 0 4.26e5 1.86e6 1.99e7 5.47e8 5.47e8 4.95e15 7.04e7
## # ℹ 3 more variables: IQR <dbl>, CV <dbl>, moda <dbl>
##
##
## ### Variable: deseq_vertical
## clase freq rel pct acum rel_acum pct_acum li
## 1 [0.208,0.295] 8 0.08333333 8.333333 8 0.08333333 8.333333 0.208
## 2 (0.295,0.381] 3 0.03125000 3.125000 11 0.11458333 11.458333 0.295
## 3 (0.381,0.468] 9 0.09375000 9.375000 20 0.20833333 20.833333 0.381
## 4 (0.468,0.555] 17 0.17708333 17.708333 37 0.38541667 38.541667 0.468
## 5 (0.555,0.641] 12 0.12500000 12.500000 49 0.51041667 51.041667 0.555
## 6 (0.641,0.728] 18 0.18750000 18.750000 67 0.69791667 69.791667 0.641
## 7 (0.728,0.815] 15 0.15625000 15.625000 82 0.85416667 85.416667 0.728
## 8 (0.815,0.901] 14 0.14583333 14.583333 96 1.00000000 100.000000 0.815
## ls pm
## 1 0.295 0.2515
## 2 0.381 0.3380
## 3 0.468 0.4245
## 4 0.555 0.5115
## 5 0.641 0.5980
## 6 0.728 0.6845
## 7 0.815 0.7715
## 8 0.901 0.8580
## # A tibble: 1 × 14
## n media mediana min Q1 Q2 Q3 max rango varianza desvio IQR
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 96 0.611 0.631 0.208 0.491 0.631 0.753 0.901 0.693 0.0347 0.186 0.262
## # ℹ 2 more variables: CV <dbl>, moda <dbl>
if ("anio" %in% names(df)) {
df %>%
group_by(anio) %>%
summarise(
n = n(),
media_resultado = if ("resultado" %in% names(df)) mean(resultado, na.rm = TRUE) else NA_real_,
mediana_resultado = if ("resultado" %in% names(df)) median(resultado, na.rm = TRUE) else NA_real_
)
}
# χ²: reestructuracion vs bancos_privatizados (si existen)
if (all(c("reestructuracion","bancos_privatizados") %in% names(df))) {
print(chisq.test(table(df$reestructuracion, df$bancos_privatizados)))
}
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(df$reestructuracion, df$bancos_privatizados)
## X-squared = 0.51429, df = 1, p-value = 0.4733
# t-test de medias: resultado por bancos_privatizados
if (all(c("resultado","bancos_privatizados") %in% names(df))) {
print(t.test(resultado ~ bancos_privatizados, data = df))
}
##
## Welch Two Sample t-test
##
## data: resultado by bancos_privatizados
## t = 0.81344, df = 49.086, p-value = 0.4199
## alternative hypothesis: true difference in means between group No and group Si is not equal to 0
## 95 percent confidence interval:
## -1871.452 4417.008
## sample estimates:
## mean in group No mean in group Si
## 689.7137 -583.0641
# IC para la media de 'resultado' en 2019
if (all(c("anio","resultado") %in% names(df))) {
x2019 <- df %>% dplyr::filter(anio == 2019) %>% dplyr::pull(resultado)
if (sum(!is.na(x2019)) > 1) print(t.test(x2019, conf.level = 0.95))
}
# IC para proporción de reestructuración == "Si"
if ("reestructuracion" %in% names(df)) {
y <- as.integer(df$reestructuracion == "Si")
print(prop.test(sum(y, na.rm = TRUE), length(y)))
}
##
## 1-sample proportions test with continuity correction
##
## data: sum(y, na.rm = TRUE) out of length(y), null probability 0.5
## X-squared = 23.01, df = 1, p-value = 1.611e-06
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.6493381 0.8302969
## sample estimates:
## p
## 0.75
if ("provincias" %in% names(df)) {
df %>%
count(provincias, sort = TRUE) %>%
slice_head(n = 10)
}