import sys
import subprocess
import importlib.util
# Maps import name -> pip package name (they differ for scikit-learn)
required = {
"numpy": "numpy",
"pandas": "pandas",
"scipy": "scipy",
"sklearn": "scikit-learn",
"matplotlib": "matplotlib",
"tqdm": "tqdm",
}
missing = [pip_name for mod_name, pip_name in required.items()
if importlib.util.find_spec(mod_name) is None]
if missing:
print("Instalando paquetes faltantes:", ", ".join(missing))
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q"] + missing)
print("Instalación terminada.")
import time
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from scipy.stats import randint
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
confusion_matrix,
classification_report,
accuracy_score,
precision_score,
recall_score,
f1_score,
roc_auc_score
)
from sklearn.model_selection import (
StratifiedKFold,
RepeatedStratifiedKFold,
cross_validate,
ParameterSampler
)
from tqdm.notebook import tqdm
start_time = time.time()
x = pd.read_csv('/home/felorrieta/Downloads/path_signature_iisignature_M9.csv')
y = pd.read_csv('/home/felorrieta/Catalina/ts_v9.0.1_SMBH_ZTF_xmatch.csv')
y["id"] = y["oid"]
data = pd.merge(x, y, on="id")
# Sample 80% for modeling; take the complement BEFORE resetting indices,
# otherwise drop() would remove rows by the new 0..N positions instead of
# the originally sampled rows.
data_modelado = data.sample(frac=0.8, random_state=42)
data_test = data.drop(data_modelado.index).reset_index(drop=True)
data_modelado = data_modelado.reset_index(drop=True)
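# A stratified alternative (sketch, left commented out so the split above
# stays authoritative): train_test_split preserves class proportions in both
# halves, which matters for the minority classes weighted below.
# from sklearn.model_selection import train_test_split
# data_modelado, data_test = train_test_split(
#     data, test_size=0.2, random_state=42,
#     stratify=data["survey_class_mapped"])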
X_train = data_modelado.drop(
columns=['oid', 'survey_class_mapped', 'survey_class', 'survey_class_cat', 'id']
)
y_train = data_modelado['survey_class_mapped']
X_test = data_test[X_train.columns].copy()
y_test = data_test['survey_class_mapped']
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)
labels = le.classes_
print("Clases codificadas:")
print(dict(enumerate(labels)))
# Per-class weights, keyed by class name
pesos_por_nombre = {
'AGN': 2.0,
'Blazar': 3.0,
'QSO': 1.0
}
class_weight_dict = {}
for cls_name, peso in pesos_por_nombre.items():
if cls_name in le.classes_:
class_weight_dict[le.transform([cls_name])[0]] = peso
print("\nclass_weight_dict usado:")
print(class_weight_dict)
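# Optional sanity check (a sketch, not part of the original pipeline): compare
# the hand-tuned weights against sklearn's "balanced" heuristic.
# from sklearn.utils.class_weight import compute_class_weight
# balanced = compute_class_weight(
#     "balanced", classes=np.unique(y_train_encoded), y=y_train_encoded)
# print(dict(zip(np.unique(y_train_encoded), balanced)))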
def evaluar_modelo(clf, X_tr, y_tr, X_te, y_te):
y_pred_tr = clf.predict(X_tr)
y_pred_te = clf.predict(X_te)
out = {
'cm_train': confusion_matrix(y_tr, y_pred_tr),
'cm_test': confusion_matrix(y_te, y_pred_te),
'acc_train': accuracy_score(y_tr, y_pred_tr),
'prec_train': precision_score(y_tr, y_pred_tr, average='weighted', zero_division=0),
'rec_train': recall_score(y_tr, y_pred_tr, average='weighted', zero_division=0),
'f1_train': f1_score(y_tr, y_pred_tr, average='weighted', zero_division=0),
'acc_test': accuracy_score(y_te, y_pred_te),
'prec_test': precision_score(y_te, y_pred_te, average='weighted', zero_division=0),
'rec_test': recall_score(y_te, y_pred_te, average='weighted', zero_division=0),
'f1_test': f1_score(y_te, y_pred_te, average='weighted', zero_division=0),
}
if hasattr(clf, "predict_proba"):
try:
proba_tr = clf.predict_proba(X_tr)
proba_te = clf.predict_proba(X_te)
out['auc_train'] = roc_auc_score(
y_tr, proba_tr,
multi_class="ovr",
average="weighted"
)
out['auc_test'] = roc_auc_score(
y_te, proba_te,
multi_class="ovr",
average="weighted"
)
except Exception:
out['auc_train'] = np.nan
out['auc_test'] = np.nan
else:
out['auc_train'] = np.nan
out['auc_test'] = np.nan
return out
def print_bloque_modelo(nombre, idx, params, metrics, labels):
print(f"{nombre} #{idx} | params = {params}")
print("Matriz de confusión — Entrenamiento")
df_cm_tr = pd.DataFrame(metrics['cm_train'], index=labels, columns=labels)
print(df_cm_tr)
print("\nMatriz de confusión — Test")
df_cm_te = pd.DataFrame(metrics['cm_test'], index=labels, columns=labels)
print(df_cm_te)
print("\nMÉTRICAS (train)")
print(f" Accuracy : {metrics['acc_train']:.3f}")
print(f" Precision: {metrics['prec_train']:.3f}")
print(f" Recall : {metrics['rec_train']:.3f}")
print(f" F1-score : {metrics['f1_train']:.3f}")
print(f" AUC : {metrics.get('auc_train', np.nan):.3f}")
print("\nMÉTRICAS (test)")
print(f" Accuracy : {metrics['acc_test']:.3f}")
print(f" Precision: {metrics['prec_test']:.3f}")
print(f" Recall : {metrics['rec_test']:.3f}")
print(f" F1-score : {metrics['f1_test']:.3f}")
print(f" AUC : {metrics.get('auc_test', np.nan):.3f}")
print("\nShapes:")
print("X_train:", X_train.shape)
print("X_test :", X_test.shape)
print("y_train:", y_train.shape)
print("y_test :", y_test.shape)
import time
import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
confusion_matrix,
classification_report,
roc_auc_score
)
from sklearn.model_selection import (
StratifiedKFold,
RepeatedStratifiedKFold,
cross_validate,
ParameterSampler
)
from tqdm.notebook import tqdm
start_time = time.time()
cv_strategy = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
scoring_metric = "roc_auc_ovr_weighted"
rf_base = RandomForestClassifier(
random_state=42,
n_jobs=-1,
class_weight=class_weight_dict,
bootstrap=True
)
max_features_opts = ["sqrt", "log2"] + [round(v, 2) for v in np.linspace(0.2, 0.8, 7)]
param_dist_robusto = {
"n_estimators": randint(600, 5001),
"max_depth": [None] + list(range(4, 21)),
"max_features": max_features_opts,
"min_samples_split": randint(20, 201),
"min_samples_leaf": randint(5, 51),
"max_samples": [0.4, 0.6, 0.8],
"criterion": ["gini", "entropy"],
}
n_iter = 200
param_list = list(ParameterSampler(param_dist_robusto, n_iter=n_iter, random_state=42))
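# This manual ParameterSampler + cross_validate loop mirrors what
# RandomizedSearchCV does internally; it is unrolled here to drive a tqdm bar.
# Equivalent one-shot version (sketch, unused):
# from sklearn.model_selection import RandomizedSearchCV
# search = RandomizedSearchCV(rf_base, param_dist_robusto, n_iter=n_iter,
#                             scoring=scoring_metric, cv=cv_strategy,
#                             return_train_score=True, random_state=42,
#                             n_jobs=-1)
# search.fit(X_train, y_train_encoded)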
search_rows = []
with tqdm(
total=n_iter,
desc="RF RandomizedSearch",
leave=True,
dynamic_ncols=True
) as pbar:
for i, params_i in enumerate(param_list, start=1):
rf_i = clone(rf_base)
rf_i.set_params(**params_i)
cv_out = cross_validate(
rf_i,
X_train,
y_train_encoded,
cv=cv_strategy,
scoring=scoring_metric,
return_train_score=True,
n_jobs=-1
)
mean_test_score = float(np.mean(cv_out["test_score"]))
std_test_score = float(np.std(cv_out["test_score"]))
mean_train_score = float(np.mean(cv_out["train_score"]))
gap_cv_auc = mean_train_score - mean_test_score
search_rows.append({
"params": params_i,
"mean_test_score": mean_test_score,
"std_test_score": std_test_score,
"mean_train_score": mean_train_score,
"gap_cv_auc": gap_cv_auc,
})
pbar.update(1)
pbar.set_postfix({
"iter": i,
"best_auc": f"{max(r['mean_test_score'] for r in search_rows):.4f}"
})
results_df = pd.DataFrame(search_rows).copy()
results_df["rank_test_score"] = results_df["mean_test_score"].rank(
ascending=False, method="min"
).astype(int)
top5 = results_df.nlargest(5, "mean_test_score")[
["params", "mean_test_score", "std_test_score", "mean_train_score", "gap_cv_auc", "rank_test_score"]
].reset_index(drop=True)
print("\n" + "#" * 80)
print("TOP 5 (según AUC CV)")
print("#" * 80)
print(top5.to_string(index=True))
rskf = RepeatedStratifiedKFold(n_splits=4, n_repeats=5, random_state=42)
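# RepeatedStratifiedKFold with n_splits=4 and n_repeats=5 refits each
# candidate 20 times, giving a lower-variance AUC estimate than the single
# 4-fold CV used during the search above.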
labels = le.classes_
resumen = []
def eval_test(model, name):
    y_pred = model.predict(X_test)
    print("\n" + "=" * 80)
    print(name)
    print("=" * 80)
    print("Confusion matrix (TEST)")
    print(pd.DataFrame(confusion_matrix(y_test_encoded, y_pred), index=labels, columns=labels))
    print("\nReport (TEST)")
    print(classification_report(y_test_encoded, y_pred, target_names=labels, zero_division=0))
print("\n" + "#" * 80)
print("EVALUACIÓN TOP-5: CV (AUC) en TRAIN")
print("#" * 80)
for i, row in top5.iterrows():
params_i = row["params"]
rf = RandomForestClassifier(
random_state=100 + i,
n_jobs=-1,
class_weight=class_weight_dict,
bootstrap=True,
**params_i
)
cv_out = cross_validate(
rf,
X_train,
y_train_encoded,
cv=rskf,
scoring=scoring_metric,
return_train_score=True,
n_jobs=-1
)
cv_train_mean = float(np.mean(cv_out["train_score"]))
cv_val_mean = float(np.mean(cv_out["test_score"]))
cv_val_std = float(np.std(cv_out["test_score"]))
gap = cv_train_mean - cv_val_mean
rf.fit(X_train, y_train_encoded)
metrics_holdout = evaluar_modelo(rf, X_train, y_train_encoded, X_test, y_test_encoded)
proba_train = rf.predict_proba(X_train)
proba_test = rf.predict_proba(X_test)
auc_train_holdout = roc_auc_score(
y_train_encoded, proba_train,
multi_class="ovr", average="weighted"
)
auc_test_holdout = roc_auc_score(
y_test_encoded, proba_test,
multi_class="ovr", average="weighted"
)
print_bloque_modelo("RF ROBUSTO TOP-5", i + 1, params_i, metrics_holdout, labels)
eval_test(rf, f"RF_rob_top{i+1} | params={params_i}")
resumen.append({
"modelo": f"RF_rob_top{i+1}",
"params": params_i,
"rs_mean_auc_cv": row["mean_test_score"],
"rs_std_auc_cv": row["std_test_score"],
"rs_gap_auc_cv": row["gap_cv_auc"],
"cv_train_mean_auc": cv_train_mean,
"cv_val_mean_auc": cv_val_mean,
"cv_val_std_auc": cv_val_std,
"cv_gap_auc": gap,
"train_auc": auc_train_holdout,
"test_auc": auc_test_holdout,
"test_acc": metrics_holdout["acc_test"],
"test_f1_weighted": metrics_holdout["f1_test"],
})
df_res = pd.DataFrame(resumen).sort_values(
["test_auc", "cv_val_mean_auc"], ascending=False
).reset_index(drop=True)
print("\n" + "=" * 110)
print("SUMMARY (sorted by test AUC, then CV val AUC)")
print("=" * 110)
print(df_res[[
"modelo", "params",
"rs_mean_auc_cv", "rs_std_auc_cv", "rs_gap_auc_cv",
"cv_train_mean_auc", "cv_val_mean_auc", "cv_val_std_auc", "cv_gap_auc",
"train_auc", "test_auc",
"test_acc", "test_f1_weighted"
]].to_string(index=False))
elapsed_seconds = int(time.time() - start_time)
hours, rem = divmod(elapsed_seconds, 3600)
minutes, seconds = divmod(rem, 60)
print(f"\nTiempo total: {hours:02d}:{minutes:02d}:{seconds:02d}")
# confusion matrices
import time
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
start_time_save = time.time()
downloads = Path.home() / "Downloads"
if not downloads.exists():
    downloads = Path.home() / "Descargas"  # Spanish-locale folder name
downloads.mkdir(parents=True, exist_ok=True)
labels = le.classes_
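# The top-5 hyperparameter sets below are apparently transcribed from an
# earlier run of the search above, so this cell can be rerun without
# repeating the 200-iteration search.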
top5_params_in_order = [
("RF_rob_top1", {'criterion': 'entropy', 'max_depth': 15, 'max_features': 0.8, 'max_samples': 0.8,
'min_samples_leaf': 13, 'min_samples_split': 41, 'n_estimators': 3141}),
("RF_rob_top2", {'criterion': 'entropy', 'max_depth': 13, 'max_features': 0.7, 'max_samples': 0.6,
'min_samples_leaf': 10, 'min_samples_split': 42, 'n_estimators': 1414}),
("RF_rob_top3", {'criterion': 'gini', 'max_depth': 19, 'max_features': 0.3, 'max_samples': 0.6,
'min_samples_leaf': 10, 'min_samples_split': 34, 'n_estimators': 3091}),
("RF_rob_top4", {'criterion': 'entropy', 'max_depth': 13, 'max_features': 0.8, 'max_samples': 0.6,
'min_samples_leaf': 16, 'min_samples_split': 54, 'n_estimators': 3160}),
("RF_rob_top5", {'criterion': 'gini', 'max_depth': None, 'max_features': 0.3, 'max_samples': 0.6,
'min_samples_leaf': 6, 'min_samples_split': 29, 'n_estimators': 2465}),
]
def _row_normalize(cm):
cm = cm.astype(float)
row_sums = cm.sum(axis=1, keepdims=True)
row_sums[row_sums == 0] = 1.0
return (cm / row_sums) * 100.0
def save_confusion_train_test(cm_train, cm_test, labels, outpath,
title_prefix="", subtitle="",
gap_width=0.28, wspace=0.15,
label_fontsize=13, tick_fontsize=13, title_fontsize=14):
"""
Guarda TRAIN y TEST lado a lado:
- % por fila grande
- (conteo) pequeño debajo
"""
cm_tr_pct = _row_normalize(cm_train)
cm_te_pct = _row_normalize(cm_test)
fig = plt.figure(figsize=(10.8, 4.8))
gs = gridspec.GridSpec(
1, 4,
width_ratios=[1, gap_width, 1, 0.08],
wspace=wspace
)
ax1 = fig.add_subplot(gs[0, 0])
ax_gap = fig.add_subplot(gs[0, 1])
ax2 = fig.add_subplot(gs[0, 2])
ax_cbar = fig.add_subplot(gs[0, 3])
ax_gap.axis("off")
panels = [
(ax1, cm_tr_pct, cm_train, "Train"),
(ax2, cm_te_pct, cm_test, "Test"),
]
vmin, vmax = 0, 100
for ax, cm_pct, cm_cnt, t in panels:
im = ax.imshow(cm_pct, cmap="BuPu", vmin=vmin, vmax=vmax)
ax.set_title(t, fontsize=title_fontsize)
ax.set_xticks(np.arange(len(labels)))
ax.set_yticks(np.arange(len(labels)))
ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=tick_fontsize)
ax.set_yticklabels(labels, fontsize=tick_fontsize)
ax.set_xlabel("Predicho", fontsize=label_fontsize)
ax.set_ylabel("Real", fontsize=label_fontsize)
thr = 50
for i in range(cm_pct.shape[0]):
for j in range(cm_pct.shape[1]):
pct = cm_pct[i, j]
cnt = int(cm_cnt[i, j])
color_txt = "white" if pct > thr else "black"
ax.text(j, i - 0.10, f"{pct:.1f}%",
ha="center", va="center",
color=color_txt, fontsize=10, fontweight="bold")
ax.text(j, i + 0.22, f"({cnt})",
ha="center", va="center",
color=color_txt, fontsize=7)
    fig.colorbar(im, cax=ax_cbar, label="% per row (true class)")
fig.suptitle(f"{title_prefix}\n{subtitle}", fontsize=13, y=0.98)
fig.subplots_adjust(left=0.08, right=0.92, bottom=0.22, top=0.82)
fig.savefig(outpath, dpi=300, bbox_inches="tight")
plt.close(fig)
for i, (name, params_i) in enumerate(top5_params_in_order):
rf = RandomForestClassifier(
random_state=100 + i,
n_jobs=-1,
class_weight=class_weight_dict,
bootstrap=True,
**params_i
)
rf.fit(X_train, y_train_encoded)
y_pred_tr = rf.predict(X_train)
y_pred_te = rf.predict(X_test)
cm_train = confusion_matrix(y_train_encoded, y_pred_tr)
cm_test = confusion_matrix(y_test_encoded, y_pred_te)
acc_tr = accuracy_score(y_train_encoded, y_pred_tr)
acc_te = accuracy_score(y_test_encoded, y_pred_te)
f1_tr = f1_score(y_train_encoded, y_pred_tr, average="weighted", zero_division=0)
f1_te = f1_score(y_test_encoded, y_pred_te, average="weighted", zero_division=0)
subtitle = f"Acc train={acc_tr:.3f} | Acc test={acc_te:.3f} | F1w train={f1_tr:.3f} | F1w test={f1_te:.3f}"
outpath = downloads / f"RF_IISIG_FIRMA_{i+1}.png"
save_confusion_train_test(
cm_train, cm_test, labels,
outpath=outpath,
title_prefix=f"{name} (RF_IISIG_FIRMA_{i+1})",
subtitle=subtitle,
gap_width=0.28,
wspace=0.15
)
print(f"Guardado: {outpath}")
elapsed = int(time.time() - start_time_save)
h, r = divmod(elapsed, 3600)
m, s = divmod(r, 60)
print(f"\nTiempo total guardando figuras: {h:02d}:{m:02d}:{s:02d}")
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
start_time_imp = time.time()
# Best hyperparameters (RF_rob_top1 from the search above)
best_params_sig = {
'criterion': 'entropy',
'max_depth': 15,
'max_features': 0.8,
'max_samples': 0.8,
'min_samples_leaf': 13,
'min_samples_split': 41,
'n_estimators': 3141
}
rf_best = RandomForestClassifier(
random_state=100,
n_jobs=-1,
class_weight=class_weight_dict,
bootstrap=True,
**best_params_sig
)
rf_best.fit(X_train, y_train_encoded)
feat_idx = pd.to_numeric(pd.Index(X_train.columns).astype(str), errors="coerce")
if feat_idx.isna().any():
raise ValueError("Tus columnas no se pudieron convertir a números. Revisa X_train.columns.")
feat_idx = feat_idx.astype(int)
min_idx, max_idx = int(feat_idx.min()), int(feat_idx.max())
print(f"Rango original columnas: {min_idx} .. {max_idx}")
# If the indices start at 0, shift them to 1..N
if min_idx == 0:
    feat_idx_1based = feat_idx + 1
    print("Detected 0-based indices -> using idx_1based = idx + 1")
else:
feat_idx_1based = feat_idx
print(f"Rango idx_1based: {int(feat_idx_1based.min())} .. {int(feat_idx_1based.max())}")
edges = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
lvl_labels = [f"N{i}" for i in range(0, 10)]
niveles = pd.cut(feat_idx_1based, bins=edges, labels=lvl_labels, right=False, include_lowest=True)
if niveles.isna().any():
    bad = np.array(X_train.columns)[niveles.isna()][:10]
    raise ValueError(f"Some features still fall outside the bin edges. Examples: {bad}")
importances = rf_best.feature_importances_
df = pd.DataFrame({
"feature": X_train.columns.astype(str),
"nivel": niveles.astype(str),
"importance": importances
})
res = df.groupby("nivel", as_index=False).agg(
n_features=("importance", "size"),
importancia_sum=("importance", "sum")
)
res["importancia_pct"] = 100 * res["importancia_sum"] / res["importancia_sum"].sum()
res["importancia_prom"] = res["importancia_sum"] / res["n_features"]
res["nivel_num"] = res["nivel"].str.replace("N", "", regex=False).astype(int)
res = res.sort_values("nivel_num").drop(columns="nivel_num").reset_index(drop=True)
print("\nIMPORTANCIA POR NIVEL — RF_rob_top1 (FIRMA NORMAL)")
print(res[["nivel","n_features","importancia_sum","importancia_pct","importancia_prom"]].to_string(index=False))
res = res[res["nivel"] != "N0"].reset_index(drop=True)
xpos = np.arange(len(res))
xticks_labels = [f"Nivel {n}" for n in res["nivel"].str.replace("N", "", regex=False)]
fig, ax1 = plt.subplots(figsize=(10.5, 5.2))
ax1.bar(xpos, res["importancia_pct"], edgecolor="black", alpha=0.9, color="plum")
ax1.set_ylabel("Porcentaje de importancia")
ax1.set_xlabel("Nivel")
ax1.set_title("Primer modelo — Importancia por nivel")
ax1.grid(axis="y", linestyle="--", alpha=0.5)
ax1.set_xticks(xpos)
ax1.set_xticklabels(xticks_labels, rotation=0)
ax2 = ax1.twinx()
ax2.plot(xpos, res["importancia_prom"], marker="o", color="purple")
ax2.set_ylabel("Importancia promedio")
plt.tight_layout()
plt.show()
elapsed = int(time.time() - start_time_imp)
h, r = divmod(elapsed, 3600)
m, s = divmod(r, 60)
print(f"\nTiempo total: {h:02d}:{m:02d}:{s:02d}")
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tqdm.notebook import tqdm  # used by the per-level loop below
best_params_sig = {
'criterion': 'entropy',
'max_depth': 15,
'max_features': 0.8,
'max_samples': 0.8,
'min_samples_leaf': 13,
'min_samples_split': 41,
'n_estimators': 3141
}
col_nums = pd.to_numeric(pd.Index(X_train.columns).astype(str), errors="coerce")
if col_nums.isna().any():
    raise ValueError("X_train.columns are not numeric (0..1022).")
col_nums = col_nums.astype(int)
cols_sorted = [c for _, c in sorted(zip(col_nums.values, X_train.columns), key=lambda t: t[0])]
nums_sorted = sorted(col_nums.values)
EXCLUDE_N0 = True
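# Cumulative feature count through level m for a 2-d path:
# sum_{k=1..m} 2**k = 2**(m+1) - 2. With 0-based columns and the constant term
# at index 0, levels 1..m occupy indices 1..(2**(m+1) - 2), which is exactly
# what end_idx selects below; EXCLUDE_N0 drops the constant at index 0.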
rows = []
for m in tqdm(range(1, 10), desc="Evaluating signature levels (AUC)", unit="level"):
end_idx = (2 ** (m + 1)) - 2
if EXCLUDE_N0:
selected = [c for c, n in zip(cols_sorted, nums_sorted) if (1 <= n <= end_idx)]
else:
selected = [c for c, n in zip(cols_sorted, nums_sorted) if (0 <= n <= end_idx)]
Xtr_m = X_train[selected]
Xte_m = X_test[selected]
rf = RandomForestClassifier(
random_state=100,
n_jobs=-1,
class_weight=class_weight_dict,
bootstrap=True,
**best_params_sig
)
rf.fit(Xtr_m, y_train_encoded)
pred_tr = rf.predict(Xtr_m)
pred_te = rf.predict(Xte_m)
acc_tr = accuracy_score(y_train_encoded, pred_tr)
f1_tr = f1_score(y_train_encoded, pred_tr, average="weighted", zero_division=0)
acc_te = accuracy_score(y_test_encoded, pred_te)
f1_te = f1_score(y_test_encoded, pred_te, average="weighted", zero_division=0)
proba_tr = rf.predict_proba(Xtr_m)
proba_te = rf.predict_proba(Xte_m)
auc_tr = roc_auc_score(y_train_encoded, proba_tr, multi_class="ovr", average="weighted")
auc_te = roc_auc_score(y_test_encoded, proba_te, multi_class="ovr", average="weighted")
rows.append({
"NivelFirma": m,
"N_features": Xtr_m.shape[1],
"AccTrain": acc_tr,
"F1Train": f1_tr,
"AUCTrain": auc_tr,
"AccTest": acc_te,
"F1Test": f1_te,
"AUCTest": auc_te
})
df_levels = pd.DataFrame(rows)
best_auc = df_levels["AUCTest"].max()
tol = 1e-4
best_candidates = df_levels[df_levels["AUCTest"] >= best_auc - tol]
best_simple = best_candidates.sort_values(["NivelFirma"]).iloc[0]
print("RESULTADOS POR NIVEL — criterio AUC test")
print(df_levels.to_string(index=False, float_format=lambda x: f"{x:.3f}"))
print("NIVEL MÁS SIMPLE QUE MAXIMIZA AUC test")
print(best_simple.to_string())
print("FILAS LaTeX (sin AUC)")
for _, r in df_levels.iterrows():
print(f"{int(r['NivelFirma'])} & {int(r['N_features'])} & "
f"{r['AccTrain']:.3f} & {r['F1Train']:.3f} & {r['AccTest']:.3f} & {r['F1Test']:.3f} \\\\")
print("FILAS LaTeX (tabla EXTENDIDA")
for _, r in df_levels.iterrows():
print(f"{int(r['NivelFirma'])} & {int(r['N_features'])} & "
f"{r['AccTrain']:.3f} & {r['F1Train']:.3f} & {r['AccTest']:.3f} & {r['F1Test']:.3f} & {r['AUCTest']:.3f} \\\\")