import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import pdist, squareform
from sklearn.datasets import load_iris
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
ANÁLISIS DE CLUSTER: EJEMPLO 2 EN PYTHON
# Use the Iris dataset as the worked example.
iris = load_iris()
X = iris.data  # feature matrix taken from the loaded dataset
print(X)
[[5.1 3.5 1.4 0.2]
[4.9 3. 1.4 0.2]
[4.7 3.2 1.3 0.2]
[4.6 3.1 1.5 0.2]
[5. 3.6 1.4 0.2]
[5.4 3.9 1.7 0.4]
[4.6 3.4 1.4 0.3]
[5. 3.4 1.5 0.2]
[4.4 2.9 1.4 0.2]
[4.9 3.1 1.5 0.1]
[5.4 3.7 1.5 0.2]
[4.8 3.4 1.6 0.2]
[4.8 3. 1.4 0.1]
[4.3 3. 1.1 0.1]
[5.8 4. 1.2 0.2]
[5.7 4.4 1.5 0.4]
[5.4 3.9 1.3 0.4]
[5.1 3.5 1.4 0.3]
[5.7 3.8 1.7 0.3]
[5.1 3.8 1.5 0.3]
[5.4 3.4 1.7 0.2]
[5.1 3.7 1.5 0.4]
[4.6 3.6 1. 0.2]
[5.1 3.3 1.7 0.5]
[4.8 3.4 1.9 0.2]
[5. 3. 1.6 0.2]
[5. 3.4 1.6 0.4]
[5.2 3.5 1.5 0.2]
[5.2 3.4 1.4 0.2]
[4.7 3.2 1.6 0.2]
[4.8 3.1 1.6 0.2]
[5.4 3.4 1.5 0.4]
[5.2 4.1 1.5 0.1]
[5.5 4.2 1.4 0.2]
[4.9 3.1 1.5 0.2]
[5. 3.2 1.2 0.2]
[5.5 3.5 1.3 0.2]
[4.9 3.6 1.4 0.1]
[4.4 3. 1.3 0.2]
[5.1 3.4 1.5 0.2]
[5. 3.5 1.3 0.3]
[4.5 2.3 1.3 0.3]
[4.4 3.2 1.3 0.2]
[5. 3.5 1.6 0.6]
[5.1 3.8 1.9 0.4]
[4.8 3. 1.4 0.3]
[5.1 3.8 1.6 0.2]
[4.6 3.2 1.4 0.2]
[5.3 3.7 1.5 0.2]
[5. 3.3 1.4 0.2]
[7. 3.2 4.7 1.4]
[6.4 3.2 4.5 1.5]
[6.9 3.1 4.9 1.5]
[5.5 2.3 4. 1.3]
[6.5 2.8 4.6 1.5]
[5.7 2.8 4.5 1.3]
[6.3 3.3 4.7 1.6]
[4.9 2.4 3.3 1. ]
[6.6 2.9 4.6 1.3]
[5.2 2.7 3.9 1.4]
[5. 2. 3.5 1. ]
[5.9 3. 4.2 1.5]
[6. 2.2 4. 1. ]
[6.1 2.9 4.7 1.4]
[5.6 2.9 3.6 1.3]
[6.7 3.1 4.4 1.4]
[5.6 3. 4.5 1.5]
[5.8 2.7 4.1 1. ]
[6.2 2.2 4.5 1.5]
[5.6 2.5 3.9 1.1]
[5.9 3.2 4.8 1.8]
[6.1 2.8 4. 1.3]
[6.3 2.5 4.9 1.5]
[6.1 2.8 4.7 1.2]
[6.4 2.9 4.3 1.3]
[6.6 3. 4.4 1.4]
[6.8 2.8 4.8 1.4]
[6.7 3. 5. 1.7]
[6. 2.9 4.5 1.5]
[5.7 2.6 3.5 1. ]
[5.5 2.4 3.8 1.1]
[5.5 2.4 3.7 1. ]
[5.8 2.7 3.9 1.2]
[6. 2.7 5.1 1.6]
[5.4 3. 4.5 1.5]
[6. 3.4 4.5 1.6]
[6.7 3.1 4.7 1.5]
[6.3 2.3 4.4 1.3]
[5.6 3. 4.1 1.3]
[5.5 2.5 4. 1.3]
[5.5 2.6 4.4 1.2]
[6.1 3. 4.6 1.4]
[5.8 2.6 4. 1.2]
[5. 2.3 3.3 1. ]
[5.6 2.7 4.2 1.3]
[5.7 3. 4.2 1.2]
[5.7 2.9 4.2 1.3]
[6.2 2.9 4.3 1.3]
[5.1 2.5 3. 1.1]
[5.7 2.8 4.1 1.3]
[6.3 3.3 6. 2.5]
[5.8 2.7 5.1 1.9]
[7.1 3. 5.9 2.1]
[6.3 2.9 5.6 1.8]
[6.5 3. 5.8 2.2]
[7.6 3. 6.6 2.1]
[4.9 2.5 4.5 1.7]
[7.3 2.9 6.3 1.8]
[6.7 2.5 5.8 1.8]
[7.2 3.6 6.1 2.5]
[6.5 3.2 5.1 2. ]
[6.4 2.7 5.3 1.9]
[6.8 3. 5.5 2.1]
[5.7 2.5 5. 2. ]
[5.8 2.8 5.1 2.4]
[6.4 3.2 5.3 2.3]
[6.5 3. 5.5 1.8]
[7.7 3.8 6.7 2.2]
[7.7 2.6 6.9 2.3]
[6. 2.2 5. 1.5]
[6.9 3.2 5.7 2.3]
[5.6 2.8 4.9 2. ]
[7.7 2.8 6.7 2. ]
[6.3 2.7 4.9 1.8]
[6.7 3.3 5.7 2.1]
[7.2 3.2 6. 1.8]
[6.2 2.8 4.8 1.8]
[6.1 3. 4.9 1.8]
[6.4 2.8 5.6 2.1]
[7.2 3. 5.8 1.6]
[7.4 2.8 6.1 1.9]
[7.9 3.8 6.4 2. ]
[6.4 2.8 5.6 2.2]
[6.3 2.8 5.1 1.5]
[6.1 2.6 5.6 1.4]
[7.7 3. 6.1 2.3]
[6.3 3.4 5.6 2.4]
[6.4 3.1 5.5 1.8]
[6. 3. 4.8 1.8]
[6.9 3.1 5.4 2.1]
[6.7 3.1 5.6 2.4]
[6.9 3.1 5.1 2.3]
[5.8 2.7 5.1 1.9]
[6.8 3.2 5.9 2.3]
[6.7 3.3 5.7 2.5]
[6.7 3. 5.2 2.3]
[6.3 2.5 5. 1.9]
[6.5 3. 5.2 2. ]
[6.2 3.4 5.4 2.3]
[5.9 3. 5.1 1.8]]
# Build a DataFrame for easier inspection, adding the true species label.
df = pd.DataFrame(X, columns=iris.feature_names)
df['species'] = iris.target
df.head()
sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | 0 |
1 | 4.9 | 3.0 | 1.4 | 0.2 | 0 |
2 | 4.7 | 3.2 | 1.3 | 0.2 | 0 |
3 | 4.6 | 3.1 | 1.5 | 0.2 | 0 |
4 | 5.0 | 3.6 | 1.4 | 0.2 | 0 |
# ========================================================
# Distance matrix (Euclidean distance)
# ========================================================
# pdist returns the pairwise distances in condensed (vector) form and
# squareform expands them into a square matrix.
dist_matrix = squareform(pdist(X, metric='euclidean'))

# Show the first 5 rows and 5 columns of the distance matrix.
print("Matriz de distancias (primeras 5 filas y 5 columnas):")
print(pd.DataFrame(dist_matrix).iloc[:5, :5])
Matriz de distancias (primeras 5 filas y 5 columnas):
0 1 2 3 4
0 0.000000 0.538516 0.509902 0.648074 0.141421
1 0.538516 0.000000 0.300000 0.331662 0.608276
2 0.509902 0.300000 0.000000 0.244949 0.509902
3 0.648074 0.331662 0.244949 0.000000 0.648074
4 0.141421 0.608276 0.509902 0.648074 0.000000
# ========================================================
# Hierarchical clustering with several linkage methods
# ========================================================
# Linkage methods to compare.
linkage_methods = ["single", "complete", "average", "ward"]

# One figure with a subplot per linkage method.
plt.figure(figsize=(20, 10))
for i, method in enumerate(linkage_methods):
    # Compute the linkage with the current method.
    Z = linkage(X, method=method)

    # Draw the dendrogram; leaves are truncated for a more concise plot.
    plt.subplot(2, 2, i + 1)
    dendrogram(Z, truncate_mode="lastp", p=30, leaf_rotation=90.)
    plt.title(f"Dendrograma usando el método {method}")
    plt.xlabel("Índice de clusters")
    plt.ylabel("Distancia")

    # Keep the Ward clusters (Ward is a common choice for Euclidean data).
    if method == "ward":
        # Cut the dendrogram into 3 clusters (the known number of Iris species).
        clusters_ward = fcluster(Z, 3, criterion="maxclust")

plt.tight_layout()
plt.show()
import seaborn as sns
# ========================================================
# Heat map of the distance matrix
# ========================================================
plt.figure(figsize=(10, 8))
sns.heatmap(dist_matrix, cmap="viridis", cbar=True)
plt.title("Mapa de calor de la matriz de distancias euclídeas")
plt.xlabel("Individuos")
plt.ylabel("Individuos")
plt.show()
# ========================================================
# Separate dendrograms for each linkage method
# ========================================================
plt.figure(figsize=(20, 15))
for i, method in enumerate(linkage_methods):
    Z = linkage(X, method=method)
    plt.subplot(2, 2, i + 1)
    dendrogram(Z, truncate_mode="lastp", p=30, leaf_rotation=90., leaf_font_size=10.)
    plt.title(f"Dendrograma usando el método {method}")
    plt.xlabel("Índice de clusters")
    plt.ylabel("Distancia")

plt.tight_layout()
plt.show()
# ========================================================
# Use Ward to assign clusters, then evaluate
# ========================================================
# Attach the cluster obtained with the Ward method to the DataFrame.
df["cluster"] = clusters_ward

# Show how many individuals fall in each cluster.
print("\nCantidad de individuos en cada cluster (método Ward):")
print(df["cluster"].value_counts())
# Show an assignment table (first 10 records).
print("\nTabla de asignación de clusters (primeros 10 registros):")
print(df.head(10))
Cantidad de individuos en cada cluster (método Ward):
cluster
3 64
1 50
2 36
Name: count, dtype: int64
Tabla de asignación de clusters (primeros 10 registros):
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
5 5.4 3.9 1.7 0.4
6 4.6 3.4 1.4 0.3
7 5.0 3.4 1.5 0.2
8 4.4 2.9 1.4 0.2
9 4.9 3.1 1.5 0.1
species cluster
0 0 1
1 0 1
2 0 1
3 0 1
4 0 1
5 0 1
6 0 1
7 0 1
8 0 1
9 0 1
# ========================================================
# Evaluation metrics
# ========================================================
# The silhouette score measures how compact and well separated the clusters are.
sil_score = silhouette_score(X, clusters_ward)
print(sil_score)
0.5543236611296417
# Since the true labels (species) are available, a similarity index (ARI)
# against them can be computed.
ari = adjusted_rand_score(iris.target, clusters_ward)

print("\nMétricas de evaluación:")
print("Silhouette Score: {:.4f}".format(sil_score))
print("Adjusted Rand Index (ARI): {:.4f}".format(ari))
Métricas de evaluación:
Silhouette Score: 0.5543
Adjusted Rand Index (ARI): 0.7312
# ========================================================
# Cluster visualisation with dimensionality reduction (PCA)
# ========================================================
# Standardise the data and apply PCA to project it onto 2 dimensions.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(8, 6))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_ward, cmap="viridis", alpha=0.7)
plt.title("Clusters visualizados en 2D tras PCA")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
# legend_elements() yields the (handles, labels) pair that plt.legend expects.
plt.legend(*scatter.legend_elements(), title="Cluster")
plt.show()
# ========================================================
# Prediction for a new individual (cluster assignment)
# ========================================================
# To "predict" the cluster of a new individual, compute each cluster's centroid.
# The centroid is the per-cluster mean; only the first four columns (the
# original feature variables) are kept.
centroids = df.groupby("cluster").mean().iloc[:, :4]
# Define a new individual (arbitrary values on the Iris scale).
new_individual = np.array([5.7, 2.8, 4.1, 1.3])
# Assign a cluster based on the minimum distance to a centroid.
def assign_cluster(new_data, centroids_df):
    """Return the label of the centroid closest to ``new_data``.

    Args:
        new_data: 1-D array-like of feature values, one per column of
            ``centroids_df``.
        centroids_df: DataFrame with one row per cluster; the row index
            provides the cluster labels.

    Returns:
        A tuple ``(assigned_cluster, distances)`` where ``distances`` is a
        Series of Euclidean distances indexed like ``centroids_df`` and
        ``assigned_cluster`` is the index label of the smallest distance.
    """
    # Row-wise Euclidean norm of (centroid - new_data), computed in one call.
    diffs = centroids_df.to_numpy(dtype=float) - np.asarray(new_data, dtype=float)
    distances = pd.Series(np.linalg.norm(diffs, axis=1), index=centroids_df.index)
    assigned_cluster = distances.idxmin()
    return assigned_cluster, distances
cluster_prediction, distances = assign_cluster(new_individual, centroids)
print("\nNuevo individuo (características):", new_individual)
print("Distancias a cada centroide:")
print(distances)
print("El nuevo individuo es asignado al Cluster:", cluster_prediction)
Nuevo individuo (características): [5.7 2.8 4.1 1.3]
Distancias a cada centroide:
cluster
1 2.990983
2 2.210300
3 0.414175
dtype: float64
El nuevo individuo es asignado al Cluster: 3
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# ========================================================
# Dimensionality reduction with PCA for visualisation
# ========================================================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Assign clusters using the Ward method.
Z_ward = linkage(X, method="ward")
clusters_ward = fcluster(Z_ward, 3, criterion="maxclust")

# Build a DataFrame for the visualisation.
df_viz = pd.DataFrame(X_pca, columns=["PCA 1", "PCA 2"])
df_viz["Cluster"] = clusters_ward.astype(str)  # cast to string for categorical colours
# ========================================================
# Interactive chart with Plotly
# ========================================================
fig = px.scatter(
    df_viz, x="PCA 1", y="PCA 2", color="Cluster",
    title="Visualización de Clusters (Método Ward)",
    labels={"Cluster": "Cluster"},
    color_discrete_sequence=px.colors.qualitative.Vivid
)
fig.update_traces(marker=dict(size=10, opacity=0.8), selector=dict(mode="markers"))

fig.update_layout(
    title_font_size=20,
    legend_title_font_size=14,
    xaxis_title="Componente Principal 1",
    yaxis_title="Componente Principal 2",
    template="plotly_white"
)

fig.show()
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from scipy.cluster.hierarchy import linkage, fcluster
import plotly.express as px
# Load the Iris dataset.
iris = load_iris()
X = iris.data

# Assign clusters using the Ward method.
Z_ward = linkage(X, method="ward")
clusters_ward = fcluster(Z_ward, 3, criterion="maxclust")

# Build a DataFrame with the original features and the assigned clusters.
df_viz_original = pd.DataFrame(X, columns=iris.feature_names)
df_viz_original["Cluster"] = clusters_ward.astype(str)  # cast to string for categorical colours
# ========================================================
# Interactive chart using the original features
# ========================================================
fig = px.scatter_matrix(
    df_viz_original,
    dimensions=iris.feature_names,
    color="Cluster",
    title="Visualización de Clusters usando las características originales",
    labels={"Cluster": "Cluster"},
    color_discrete_sequence=px.colors.qualitative.Vivid
)
fig.update_traces(diagonal_visible=False, marker=dict(size=5, opacity=0.8))

fig.update_layout(
    title_font_size=20,
    legend_title_font_size=14,
    template="plotly_white"
)

fig.show()