ANÁLISIS DE CLUSTER: EJEMPLO 2 EN PYTHON

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import pdist, squareform
from sklearn.datasets import load_iris
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Load the Iris dataset, which is the worked example for the whole analysis.
iris = load_iris()
# The feature matrix is read by key here; the loaded object exposes the same
# array as the `data` attribute.
X = iris["data"]
print(X)
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.9 1.5]
 [5.5 2.3 4.  1.3]
 [6.5 2.8 4.6 1.5]
 [5.7 2.8 4.5 1.3]
 [6.3 3.3 4.7 1.6]
 [4.9 2.4 3.3 1. ]
 [6.6 2.9 4.6 1.3]
 [5.2 2.7 3.9 1.4]
 [5.  2.  3.5 1. ]
 [5.9 3.  4.2 1.5]
 [6.  2.2 4.  1. ]
 [6.1 2.9 4.7 1.4]
 [5.6 2.9 3.6 1.3]
 [6.7 3.1 4.4 1.4]
 [5.6 3.  4.5 1.5]
 [5.8 2.7 4.1 1. ]
 [6.2 2.2 4.5 1.5]
 [5.6 2.5 3.9 1.1]
 [5.9 3.2 4.8 1.8]
 [6.1 2.8 4.  1.3]
 [6.3 2.5 4.9 1.5]
 [6.1 2.8 4.7 1.2]
 [6.4 2.9 4.3 1.3]
 [6.6 3.  4.4 1.4]
 [6.8 2.8 4.8 1.4]
 [6.7 3.  5.  1.7]
 [6.  2.9 4.5 1.5]
 [5.7 2.6 3.5 1. ]
 [5.5 2.4 3.8 1.1]
 [5.5 2.4 3.7 1. ]
 [5.8 2.7 3.9 1.2]
 [6.  2.7 5.1 1.6]
 [5.4 3.  4.5 1.5]
 [6.  3.4 4.5 1.6]
 [6.7 3.1 4.7 1.5]
 [6.3 2.3 4.4 1.3]
 [5.6 3.  4.1 1.3]
 [5.5 2.5 4.  1.3]
 [5.5 2.6 4.4 1.2]
 [6.1 3.  4.6 1.4]
 [5.8 2.6 4.  1.2]
 [5.  2.3 3.3 1. ]
 [5.6 2.7 4.2 1.3]
 [5.7 3.  4.2 1.2]
 [5.7 2.9 4.2 1.3]
 [6.2 2.9 4.3 1.3]
 [5.1 2.5 3.  1.1]
 [5.7 2.8 4.1 1.3]
 [6.3 3.3 6.  2.5]
 [5.8 2.7 5.1 1.9]
 [7.1 3.  5.9 2.1]
 [6.3 2.9 5.6 1.8]
 [6.5 3.  5.8 2.2]
 [7.6 3.  6.6 2.1]
 [4.9 2.5 4.5 1.7]
 [7.3 2.9 6.3 1.8]
 [6.7 2.5 5.8 1.8]
 [7.2 3.6 6.1 2.5]
 [6.5 3.2 5.1 2. ]
 [6.4 2.7 5.3 1.9]
 [6.8 3.  5.5 2.1]
 [5.7 2.5 5.  2. ]
 [5.8 2.8 5.1 2.4]
 [6.4 3.2 5.3 2.3]
 [6.5 3.  5.5 1.8]
 [7.7 3.8 6.7 2.2]
 [7.7 2.6 6.9 2.3]
 [6.  2.2 5.  1.5]
 [6.9 3.2 5.7 2.3]
 [5.6 2.8 4.9 2. ]
 [7.7 2.8 6.7 2. ]
 [6.3 2.7 4.9 1.8]
 [6.7 3.3 5.7 2.1]
 [7.2 3.2 6.  1.8]
 [6.2 2.8 4.8 1.8]
 [6.1 3.  4.9 1.8]
 [6.4 2.8 5.6 2.1]
 [7.2 3.  5.8 1.6]
 [7.4 2.8 6.1 1.9]
 [7.9 3.8 6.4 2. ]
 [6.4 2.8 5.6 2.2]
 [6.3 2.8 5.1 1.5]
 [6.1 2.6 5.6 1.4]
 [7.7 3.  6.1 2.3]
 [6.3 3.4 5.6 2.4]
 [6.4 3.1 5.5 1.8]
 [6.  3.  4.8 1.8]
 [6.9 3.1 5.4 2.1]
 [6.7 3.1 5.6 2.4]
 [6.9 3.1 5.1 2.3]
 [5.8 2.7 5.1 1.9]
 [6.8 3.2 5.9 2.3]
 [6.7 3.3 5.7 2.5]
 [6.7 3.  5.2 2.3]
 [6.3 2.5 5.  1.9]
 [6.5 3.  5.2 2. ]
 [6.2 3.4 5.4 2.3]
 [5.9 3.  5.1 1.8]]
# Wrap the features in a DataFrame for easier inspection and attach the true
# species label as an extra column.
df = pd.DataFrame(X, columns=iris.feature_names).assign(species=iris.target)
df.head()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) species
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0
# ========================================================
# Pairwise distance matrix (Euclidean distance)
# ========================================================

# pdist returns the distances in condensed vector form; squareform then
# expands that vector into the full square matrix.
condensed_distances = pdist(X, metric='euclidean')
dist_matrix = squareform(condensed_distances)

# Preview only the first 5 rows and 5 columns of the distance matrix.
dist_preview = pd.DataFrame(dist_matrix).iloc[:5, :5]
print("Matriz de distancias (primeras 5 filas y 5 columnas):")
print(dist_preview)
Matriz de distancias (primeras 5 filas y 5 columnas):
          0         1         2         3         4
0  0.000000  0.538516  0.509902  0.648074  0.141421
1  0.538516  0.000000  0.300000  0.331662  0.608276
2  0.509902  0.300000  0.000000  0.244949  0.509902
3  0.648074  0.331662  0.244949  0.000000  0.648074
4  0.141421  0.608276  0.509902  0.648074  0.000000
# ========================================================
# Hierarchical clustering with several linkage methods
# ========================================================
# Linkage methods to compare:
linkage_methods = ["single", "complete", "average", "ward"]

# One figure with a 2x2 grid: one dendrogram panel per linkage method.
linkage_fig, linkage_axes = plt.subplots(2, 2, figsize=(20, 10))
for panel, method in zip(linkage_axes.ravel(), linkage_methods):
    # Build the linkage matrix for the current method.
    Z = linkage(X, method=method)

    # Draw the dendrogram on its own panel; leaves are truncated so the
    # plot stays compact.
    dendrogram(Z, truncate_mode="lastp", p=30, leaf_rotation=90., ax=panel)
    panel.set_title(f"Dendrograma usando el método {method}")
    panel.set_xlabel("Índice de clusters")
    panel.set_ylabel("Distancia")

    # Ward's method is the one kept for the rest of the analysis: cut its
    # tree into 3 clusters (the known number of species in Iris).
    if method == "ward":
        clusters_ward = fcluster(Z, 3, criterion="maxclust")

plt.tight_layout()
plt.show()

import seaborn as sns

# ========================================================
# Heat map of the distance matrix
# ========================================================
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
# Draw onto the explicit Axes instead of relying on the implicit current one.
sns.heatmap(dist_matrix, cmap="viridis", cbar=True, ax=heatmap_ax)
heatmap_ax.set_title("Mapa de calor de la matriz de distancias euclídeas")
heatmap_ax.set_xlabel("Individuos")
heatmap_ax.set_ylabel("Individuos")
plt.show()

# ========================================================
# Separate dendrograms for each linkage method
# ========================================================
# Same 2x2 layout as before, on a taller figure and with an explicit leaf
# label font size.
dendro_fig, dendro_axes = plt.subplots(2, 2, figsize=(20, 15))
for panel, method in zip(dendro_axes.ravel(), linkage_methods):
    Z = linkage(X, method=method)
    dendrogram(
        Z,
        truncate_mode="lastp",
        p=30,
        leaf_rotation=90.,
        leaf_font_size=10.,
        ax=panel,
    )
    panel.set_title(f"Dendrograma usando el método {method}")
    panel.set_xlabel("Índice de clusters")
    panel.set_ylabel("Distancia")

plt.tight_layout()
plt.show()

# ========================================================
# Using Ward to assign clusters, and inspecting the result
# ========================================================
# Store the cluster label produced by Ward's method in the DataFrame.
df["cluster"] = clusters_ward

# Number of observations that fell into each cluster.
cluster_sizes = df["cluster"].value_counts()
print("\nCantidad de individuos en cada cluster (método Ward):", cluster_sizes, sep="\n")

# Assignment table, limited to the first 10 records.
first_records = df.head(10)
print("\nTabla de asignación de clusters (primeros 10 registros):", first_records, sep="\n")

Cantidad de individuos en cada cluster (método Ward):
cluster
3    64
1    50
2    36
Name: count, dtype: int64

Tabla de asignación de clusters (primeros 10 registros):
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   
5                5.4               3.9                1.7               0.4   
6                4.6               3.4                1.4               0.3   
7                5.0               3.4                1.5               0.2   
8                4.4               2.9                1.4               0.2   
9                4.9               3.1                1.5               0.1   

   species  cluster  
0        0        1  
1        0        1  
2        0        1  
3        0        1  
4        0        1  
5        0        1  
6        0        1  
7        0        1  
8        0        1  
9        0        1  
# ========================================================
# Evaluation metrics
# ========================================================
# The silhouette score rates how compact and how well separated the
# clusters are.
sil_score = silhouette_score(X, labels=clusters_ward)
print(sil_score)
0.5543236611296417
# The true labels (species) are available, so an agreement index (ARI)
# between them and the Ward clusters can be computed as well.
ari = adjusted_rand_score(labels_true=iris.target, labels_pred=clusters_ward)

print("\nMétricas de evaluación:")
# Each report line pairs its format template with the value it displays.
for report_template, metric_value in (
    ("Silhouette Score: {:.4f}", sil_score),
    ("Adjusted Rand Index (ARI): {:.4f}", ari),
):
    print(report_template.format(metric_value))

Métricas de evaluación:
Silhouette Score: 0.5543
Adjusted Rand Index (ARI): 0.7312
# ========================================================
# Cluster visualisation after dimensionality reduction (PCA)
# ========================================================
# Standardise the data, then project it onto 2 principal components.
scaler = StandardScaler()
pca = PCA(n_components=2)
X_scaled = scaler.fit_transform(X)
X_pca = pca.fit_transform(X_scaled)

# Scatter the projected points, coloured by their Ward cluster label.
pca_fig, pca_ax = plt.subplots(figsize=(8, 6))
scatter = pca_ax.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=clusters_ward,
    cmap="viridis",
    alpha=0.7,
)
pca_ax.set_title("Clusters visualizados en 2D tras PCA")
pca_ax.set_xlabel("PCA 1")
pca_ax.set_ylabel("PCA 2")
# legend_elements() yields one handle/label pair per distinct colour value.
legend_handles, legend_labels = scatter.legend_elements()
pca_ax.legend(legend_handles, legend_labels, title="Cluster")
plt.show()

# ========================================================
# Forecast for a new observation (cluster assignment)
# ========================================================
# To "predict" the cluster of a new observation, compute the centroid of each
# cluster as the mean of its members' features.
# Only the original measurement columns are averaged: they are selected by
# name, so the result does not depend on column order and the categorical
# `species` code is never averaged.
centroids = df.groupby("cluster")[list(iris.feature_names)].mean()
# Define a new observation (arbitrary values on the Iris measurement scale).
new_individual = np.array([5.7, 2.8, 4.1, 1.3])
    
# Función que asigna el cluster basado en la mínima distancia al centroide
def assign_cluster(new_data, centroids_df):
    """Assign an observation to the cluster with the nearest centroid.

    Args:
        new_data: Feature vector of the observation; it is subtracted from
            each centroid row, so it needs one value per centroid column.
        centroids_df: DataFrame with one row per cluster, indexed by the
            cluster label.

    Returns:
        A tuple ``(label, distances)``: ``label`` is the index label of the
        closest centroid, and ``distances`` is a Series holding the distance
        from the observation to every centroid, indexed like
        ``centroids_df``.
    """

    def _distance_to(centroid_row):
        # np.linalg.norm of the difference vector (Euclidean distance).
        return np.linalg.norm(centroid_row.to_numpy() - new_data)

    per_cluster = centroids_df.apply(_distance_to, axis=1)
    return per_cluster.idxmin(), per_cluster

# Assign the new observation to its nearest centroid and report the result.
cluster_prediction, distances = assign_cluster(
    new_data=new_individual,
    centroids_df=centroids,
)
print("\nNuevo individuo (características):", new_individual)
print("Distancias a cada centroide:", distances, sep="\n")
print("El nuevo individuo es asignado al Cluster:", cluster_prediction)

Nuevo individuo (características): [5.7 2.8 4.1 1.3]
Distancias a cada centroide:
cluster
1    2.990983
2    2.210300
3    0.414175
dtype: float64
El nuevo individuo es asignado al Cluster: 3
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# ========================================================
# PCA dimensionality reduction for visualisation
# ========================================================
scaler = StandardScaler()
pca = PCA(n_components=2)
X_scaled = scaler.fit_transform(X)
X_pca = pca.fit_transform(X_scaled)

# Assign clusters with Ward's method (tree cut into 3 clusters).
Z_ward = linkage(X, method="ward")
clusters_ward = fcluster(Z_ward, 3, criterion="maxclust")

# DataFrame used for plotting; the cluster label is converted to a string so
# that it is coloured as a category.
df_viz = pd.DataFrame(X_pca, columns=["PCA 1", "PCA 2"]).assign(
    Cluster=clusters_ward.astype(str)
)

# ========================================================
# Interactive plot with Plotly
# ========================================================
fig = px.scatter(
    df_viz,
    x="PCA 1",
    y="PCA 2",
    color="Cluster",
    title="Visualización de Clusters (Método Ward)",
    labels={"Cluster": "Cluster"},
    color_discrete_sequence=px.colors.qualitative.Vivid,
)
fig.update_traces(
    marker={"size": 10, "opacity": 0.8},
    selector={"mode": "markers"},
)
fig.update_layout(
    title_font_size=20,
    legend_title_font_size=14,
    xaxis_title="Componente Principal 1",
    yaxis_title="Componente Principal 2",
    template="plotly_white",
)

fig.show()
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from scipy.cluster.hierarchy import linkage, fcluster
import plotly.express as px

# Load the Iris dataset.
iris = load_iris()
X = iris.data

# Assign clusters with Ward's method (tree cut into 3 clusters).
Z_ward = linkage(X, method="ward")
clusters_ward = fcluster(Z_ward, 3, criterion="maxclust")

# DataFrame with the original features plus the assigned cluster; the label
# is converted to a string so that it is coloured as a category.
df_viz_original = pd.DataFrame(X, columns=iris.feature_names).assign(
    Cluster=clusters_ward.astype(str)
)

# ========================================================
# Interactive plot using the original features
# ========================================================
fig = px.scatter_matrix(
    df_viz_original,
    dimensions=iris.feature_names,
    color="Cluster",
    title="Visualización de Clusters usando las características originales",
    labels={"Cluster": "Cluster"},
    color_discrete_sequence=px.colors.qualitative.Vivid,
)
fig.update_traces(
    diagonal_visible=False,
    marker={"size": 5, "opacity": 0.8},
)
fig.update_layout(
    title_font_size=20,
    legend_title_font_size=14,
    template="plotly_white",
)

fig.show()