Tarea5_Rodrigo

import os
import numpy as np
import pandas as pd
from math import pi
import No_Supervisados as ns
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import dendrogram, ward, single, complete,average,linkage, fcluster
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def centroide(num_cluster, datos, clusters):
    """Return the centroid of one cluster as a single-row DataFrame.

    Parameters
    ----------
    num_cluster : cluster id to select.
    datos : DataFrame with one observation per row.
    clusters : array-like of cluster ids, one per row of ``datos``.

    Returns
    -------
    DataFrame with one row holding the column means of the selected rows.
    """
    # Boolean mask of the observations that belong to the requested cluster.
    pertenece = clusters == num_cluster
    medias = datos[pertenece].mean()
    # The means come back as a Series; turn them into a 1 x n_columns frame.
    return pd.DataFrame(medias).transpose()
  
def recodificar(col, nuevo_codigo):
    """Return a recoded copy of ``col``; the input is never modified.

    Parameters
    ----------
    col : array-like or Series with the values to recode.
    nuevo_codigo : dict mapping each old value to its new value.

    Returns
    -------
    Series where every value found among the keys of ``nuevo_codigo`` is
    replaced by the mapped value; other values are kept as they are.

    All replacements are applied in a single pass, so a new value that
    happens to equal another key is not recoded a second time
    (``{1: 2, 2: 1}`` swaps 1 and 2 instead of collapsing both).
    """
    col_cod = pd.Series(col, copy=True)
    if not nuevo_codigo:
        # Nothing to recode; hand back the untouched copy.
        return col_cod
    # One dict-based replace instead of one replace per key: per-key calls
    # would feed the output of one replacement into the next.
    return col_cod.replace(nuevo_codigo)

def bar_plot(centros, labels, scale = False,cluster = None, var = None):
    """Draw the cluster centres as side-by-side horizontal bar charts.

    One subplot per cluster is added to the current matplotlib figure; only
    the left-most subplot shows the variable names on its y axis.

    Parameters
    ----------
    centros : array-like, shape (n_clusters, n_variables)
        Cluster centres, one row per cluster. A float copy is used
        internally, so the caller's data is never modified.
    labels : array-like of str
        Variable names, one per column of ``centros``.
    scale : bool, default False
        If True, divide every column by its maximum value. Columns whose
        maximum is 0 are left unscaled.
    cluster : sequence of int, optional
        Row indices of the clusters to draw. All clusters when omitted.
    var : container of str, optional
        Variable names to keep. All variables when omitted.
    """
    from math import ceil, floor
    from seaborn import color_palette

    # Float copy: keeps the caller's array intact and lets the in-place
    # division below work when the centres arrive as integers.
    centros = np.array(centros, dtype=float)

    if scale:
        for col in range(centros.shape[1]):
            maximo = max(centros[:, col])
            # Dividing by a zero maximum would fill the column with NaN/inf
            # and make the axis-limit computation below fail.
            if maximo != 0:
                centros[:, col] /= maximo

    colores = color_palette()
    # Left axis limit: 0 unless some centre is negative. It is computed
    # before the optional ``var`` filter, so it reflects every variable.
    minimo = min(floor(centros.min()), 0)

    if var is not None:
        # np.asarray lets plain lists be used as labels as well.
        labels = np.asarray(labels)
        mascara = np.array([nombre in var for nombre in labels], dtype=bool)
        centros = centros[:, mascara]
        # Keep each variable's original palette colour after filtering.
        colores = [colores[pos % len(colores)]
                   for pos, conservar in enumerate(mascara) if conservar]
        labels = labels[mascara]

    def inside_plot(valores, titulo):
        plt.barh(range(len(valores)), valores, 1/1.5, color = colores)
        plt.xlim(minimo, ceil(centros.max()))
        plt.title(titulo)

    seleccion = range(centros.shape[0]) if cluster is None else cluster
    for pos, i in enumerate(seleccion, start=1):
        plt.subplot(1, len(seleccion), pos)
        inside_plot(centros[i].tolist(), 'Cluster ' + str(i))
        if pos == 1:
            plt.yticks(range(len(labels)), labels)
        else:
            plt.yticks([])
            
def bar_plot_detail(centros,columns_names = [], columns_to_plot = [],figsize = (10,7),dpi = 150):
    """Draw one bar chart per variable comparing its value across clusters.

    The charts are laid out on matplotlib figure number 1 in a grid with two
    columns, filled row by row.

    Parameters
    ----------
    centros : array with one row per cluster and one column per variable.
    columns_names : list of str
        Name of every column of ``centros``.
    columns_to_plot : list of str or list of int, optional
        Variables to draw, given either by name or by position in
        ``columns_names``. All variables are drawn when empty.
    figsize, dpi : forwarded to ``plt.figure``.

    Note: the list defaults are only read, never mutated.
    """
    from math import ceil
    import seaborn as sb

    numClusters = centros.shape[0]
    labels = ["Cluster " + str(i) for i in range(numClusters)]
    centros = pd.DataFrame(centros, columns=columns_names, index=labels)

    columns = columns_names
    if len(columns_to_plot) > 0:
        # isinstance (rather than an exact type check) also accepts str
        # subclasses such as numpy.str_ as column names.
        if isinstance(columns_to_plot[0], str):
            columns = columns_to_plot
        else:
            columns = [columns_names[i] for i in columns_to_plot]

    plots = len(columns)
    rows, cols = ceil(plots / 2), 2

    plt.figure(1, figsize=figsize, dpi=dpi)
    plt.subplots_adjust(hspace=1, wspace=0.5)

    for var, columna in enumerate(columns):
        # Row-major position of the var-th chart in the two-column grid.
        numRow, numCol = divmod(var, cols)
        ax = plt.subplot2grid((rows, cols), (numRow, numCol), colspan=1, rowspan=1)
        sb.barplot(y=labels, x=columna, data=centros, ax=ax)

def radar_plot(centros, labels):
    """Draw a radar (polar) chart with one polygon per cluster centre.

    Every variable is rescaled across clusters to the 0-100 range (min-max);
    a variable with the same value in every cluster is drawn at 50.

    Parameters
    ----------
    centros : array, shape (n_clusters, n_variables)
        Cluster centres, one row per cluster.
    labels : sequence of str
        Variable names, one per column of ``centros``; used as axis labels.
    """
    from math import pi

    escalados = []
    for n in np.asarray(centros).T:
        bajo, alto = min(n), max(n)
        if alto != bajo:
            escalados.append((n - bajo) / (alto - bajo) * 100)
        else:
            # Constant variable: place it mid-scale. A literal 50 avoids the
            # 0/0 = NaN that dividing the column by itself gives for zeros.
            escalados.append(np.full(len(n), 50.0))
    # Rows are variables, columns are clusters.
    centros = np.array(escalados)

    angulos = [n / float(len(labels)) * 2 * pi for n in range(len(labels))]
    # Repeat the first angle so every polygon closes on itself.
    angulos += angulos[:1]
    ax = plt.subplot(111, polar = True)
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)

    plt.xticks(angulos[:-1], labels)
    ax.set_rlabel_position(0)
    plt.yticks([10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
           ["10%", "20%", "30%", "40%", "50%", "60%", "70%", "80%", "90%", "100%"],
           color = "grey", size = 8)
    plt.ylim(-10, 100)
    for i in range(centros.shape[1]):
        valores = centros[:, i].tolist()
        valores += valores[:1]
        ax.plot(angulos, valores, linewidth = 1, linestyle = 'solid',
                label = 'Cluster ' + str(i))
        ax.fill(angulos, valores, alpha = 0.3)
    plt.legend(loc='upper right', bbox_to_anchor = (0.1, 0.1))

#1

# Exercise 1: hierarchical clustering of the country indicators.
datos = pd.read_csv(
    "C:/Users/Rodrigo/Desktop/TEC/Concentracion/datos/country_indicators.csv",
    index_col=0,
)

## a) Dendrograms for the four aggregation criteria.

# Condensed Euclidean distance vector shared by the four linkages.
dist = pdist(datos, metric="euclidean")

ward_res = ward(dist)
promedio = average(dist)
salto_min = single(dist)
salto_max = complete(dist)

# For every linkage: the heights at which the dashed "two clusters" and
# "three clusters" cut lines are drawn.
cortes_por_enlace = (
    (ward_res, 300000, 150000),
    (promedio, 80000, 59000),
    (salto_min, 40000, 30000),
    (salto_max, 120000, 80000),
)

for enlace, corte_dos, corte_tres in cortes_por_enlace:
    fig, ax = plt.subplots(1, 1, figsize=(15, 8), dpi=200)
    res = dendrogram(enlace, labels=datos.index.tolist())
    limites = ax.get_xbound()
    for altura in (corte_dos, corte_tres):
        ax.plot(limites, [altura, altura], '--', c='k')
    ax.text(limites[1], corte_dos, ' dos clústeres', va='center', fontdict={'size': 15})
    ax.text(limites[1], corte_tres, ' tres clústeres', va='center', fontdict={'size': 15})
    ax.set_xlabel("Orden en el eje X")
    ax.set_ylabel("Distancia o Agregación")
    plt.show()


##b

# Cut the Ward hierarchy into three clusters and shift the labels by -1 so
# the cluster ids used below are 0, 1 and 2.
enlace_ward = linkage(datos, method='ward', metric='euclidean')
grupos = fcluster(enlace_ward, 3, criterion='maxclust') - 1

# One centroid row per cluster, stacked into a plain array.
centros = np.array(pd.concat([ns.centroide(k, datos, grupos) for k in range(3)]))

ns.bar_plot(centros, datos.columns, scale=True)
plt.show()

ns.radar_plot(centros, datos.columns)
plt.show()

### Con la agregación de Ward, y con las gráficas, podemos ver que los clústeres se dividen de la siguiente manera. 1: child mort, total fer e Inflation, lo que nos puede decir que tratan sobre la tasa de población y su rendimiento económico. 2: imports y exports, que se refieren al porcentaje per cápita de las importaciones y exportaciones. 3: gdpp, Income y Health, que se refieren al rendimiento económico.

##c

# c) Project the standardized data onto its first two principal components
# and colour every country by its cluster.
datos_scaled = pd.DataFrame(StandardScaler().fit_transform(datos),
                            columns=datos.columns, index=datos.index)
pca = PCA(n_components=2)
componentes = pca.fit_transform(datos_scaled)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6), dpi=200)
ax.set_xlabel("Componente 1")
ax.set_ylabel("Componente 2")

# `grupos` holds three cluster ids (0, 1, 2); give each one its own colour so
# clusters 1 and 2 are no longer drawn with the same colour.
colores_cluster = {0: 'blue', 1: 'red', 2: 'green'}
for cluster_id, color in colores_cluster.items():
    miembros = grupos == cluster_id
    ax.scatter(componentes[miembros, 0], componentes[miembros, 1],
               color=color, label='Cluster ' + str(cluster_id))
ax.legend()
plt.show()

#2

data = pd.read_csv("C:/Users/Rodrigo/Desktop/TEC/Concentracion/datos/VirtualPatient.csv")

## a) Clustering with the numeric variables only.
numericos = data.select_dtypes(include=['float64', 'int64'])
distancia = pdist(numericos, metric="euclidean")
ward_distance = ward(distancia)

# Reuse the Ward linkage built above instead of recomputing it from the raw
# data; the -1 shift makes the cluster ids 0, 1 and 2.
grupos = fcluster(ward_distance, 3, criterion='maxclust') - 1

centros = np.array(pd.concat([ns.centroide(k, numericos, grupos) for k in range(3)]))

ns.bar_plot(centros, numericos.columns, scale=True)
plt.show()

ns.radar_plot(centros, numericos.columns)
plt.show()

datos_scaled = pd.DataFrame(StandardScaler().fit_transform(numericos),
                            columns=numericos.columns, index=numericos.index)

analisis = PCA(n_components=2)
componentes = analisis.fit_transform(datos_scaled)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6), dpi=200)
ax.set_xlabel("Componente 1")
ax.set_ylabel("Componente 2")

import matplotlib as mpl

# The colormap registry replaces the deprecated mpl.cm.get_cmap().
cmap = mpl.colormaps['viridis']
# Normalize maps the cluster ids onto [0, 1] and, unlike a manual division
# by max(grupos), does not fail when there is a single cluster.
norm = plt.Normalize(vmin=0, vmax=max(grupos))

ax.scatter(componentes[:, 0], componentes[:, 1], c=grupos, cmap=cmap, norm=norm)
for etiqueta, (x, y) in zip(datos_scaled.index, componentes):
    ax.annotate(text=etiqueta, xy=(x + 0.05, y))
# Show the figure here so later plots do not draw onto this one.
plt.show()

# Notebook-style display of the standardized table (no effect in a script).
datos_scaled

##b

##           age  hospitalization_three_years  ...  life_quality  social_visits
## 0    0.367728                    -0.722995  ...     -1.109116      -0.206681
## 1    0.656480                    -0.722995  ...     -0.846696       5.314553
## 2    0.656480                    -0.722995  ...     -1.305931      -0.206681
## 3    0.945233                    -0.722995  ...     -1.043511      -0.206681
## 4   -1.364787                     2.902310  ...     -0.781091      -0.201149
## ..        ...                          ...  ...           ...            ...
## 112 -0.498530                    -0.722995  ...     -0.321857      -0.173487
## 113  0.367728                    -0.722995  ...      0.334193      -0.173487
## 114  0.367728                    -0.722995  ...      0.990242      -0.173487
## 115  0.367728                    -0.722995  ...      0.990242      -0.173487
## 116  0.656480                    -0.722995  ...     -0.321857      -0.190084
## 
## [117 rows x 9 columns]

# b) Same analysis, adding the categorical variable "vision" as one-hot
# columns and standardizing everything before clustering.
numerics = data.select_dtypes(include=['float64', 'int64'])
vision_onehot = pd.get_dummies(data["vision"], prefix="vision")
datos = pd.concat([numerics, vision_onehot], axis=1)

datos_s = pd.DataFrame(StandardScaler().fit_transform(datos),
                       columns=datos.columns, index=datos.index)

dist = pdist(datos_s, metric="euclidean")
warddist = ward(dist)
grupos = fcluster(warddist, 3, criterion="maxclust") - 1
centros = np.array(pd.concat([ns.centroide(k, datos_s, grupos) for k in range(3)]))

anacomp = PCA(n_components=2)
componentes = anacomp.fit_transform(datos_s)

ns.bar_plot(centros, datos_s.columns, scale=True)
plt.show()

ns.radar_plot(centros, datos_s.columns)
plt.show()

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6), dpi=200)
ax.set_xlabel("Componente 1")
ax.set_ylabel("Componente 2")

cmap = plt.get_cmap('viridis')
norm = plt.Normalize(vmin=0, vmax=max(grupos))

# Draw every point exactly once: one scatter call per cluster, so each
# cluster gets a legend entry and the alpha blending is not doubled.
for group in np.unique(grupos):
    ind = np.where(grupos == group)[0]
    ax.scatter(componentes[ind, 0], componentes[ind, 1], s=50, alpha=0.7,
               color=cmap(norm(group)), edgecolors='none',
               label=f"Grupo {group+1}")

# Label each point with the row index of the table that was actually
# clustered in this section (datos_s).
for etiqueta, (x, y) in zip(datos_s.index, componentes):
    ax.annotate(text=etiqueta, xy=(x + 0.05, y))

ax.legend(fontsize=12, loc='upper left', bbox_to_anchor=(1.05, 1.0))
plt.title('Gráfico de dispersión con PCA', fontsize=16)
plt.show()

##c
### Yo pienso que está mejor el segundo ejercicio, agregando la variable "Vision", ya que genera una mejor separación de los clústeres; además, creo que aporta información más valiosa. Sin embargo, a diferencia del primer ejercicio, hay valores negativos, lo que lo hace difícil de comprender. (Nota: estos valores negativos no son distancias, ya que una distancia nunca es negativa; son centros calculados sobre variables estandarizadas, y un valor negativo indica que el clúster está por debajo de la media en esa variable.)

#3
##a
def city_block(p1, p2):
    """Return the city-block (Manhattan, L1) distance between two points.

    Parameters
    ----------
    p1, p2 : sequences of numbers with the same length.

    Returns
    -------
    Sum of the absolute coordinate differences; 0 for empty points.

    Raises
    ------
    ValueError
        If the points do not have the same number of coordinates.
    """
    if len(p1) != len(p2):
        # Without this check the extra coordinates of a longer p2 would be
        # silently ignored.
        raise ValueError(
            "p1 and p2 must have the same length "
            f"(got {len(p1)} and {len(p2)})"
        )
    return sum(abs(a - b) for a, b in zip(p1, p2))

##b
from scipy.spatial.distance import cdist

# Students table: ';' separates fields and ',' is the decimal mark.
estudiantes = pd.read_csv(
    "C:/Users/Rodrigo/Desktop/TEC/Concentracion/datos/EjemploEstudiantes.csv",
    delimiter=';',
    decimal=",",
    index_col=0,
)

# Full pairwise distance matrix between students with the custom metric.
valores = estudiantes.values
matdis = cdist(valores, valores, metric=city_block)
print(matdis)


##c

## [[ 0.   7.9  5.6  7.3  3.9  7.5  3.   8.4  8.   2.8]
##  [ 7.9  0.   2.5  8.2  7.6  2.4  7.3  2.9  8.9  6.3]
##  [ 5.6  2.5  0.   9.3  6.1  1.9  5.6  2.8 10.   5. ]
##  [ 7.3  8.2  9.3  0.   6.4 10.6  6.9 11.1  3.3  8.7]
##  [ 3.9  7.6  6.1  6.4  0.   7.2  1.3  8.1  6.1  3.3]
##  [ 7.5  2.4  1.9 10.6  7.2  0.   6.9  0.9 11.3  5.9]
##  [ 3.   7.3  5.6  6.9  1.3  6.9  0.   7.8  7.4  2. ]
##  [ 8.4  2.9  2.8 11.1  8.1  0.9  7.8  0.  11.8  6.8]
##  [ 8.   8.9 10.   3.3  6.1 11.3  7.4 11.8  0.   9.4]
##  [ 2.8  6.3  5.   8.7  3.3  5.9  2.   6.8  9.4  0. ]]

import scipy.cluster.hierarchy as sch

# Complete-linkage hierarchy over the custom city-block distances. `dist` is
# already a condensed distance vector, so linkage needs no metric argument.
dist = pdist(estudiantes, metric=city_block)
enlaces = sch.linkage(dist, method='complete')
# Three clusters, relabelled to 0, 1 and 2.
grupos = sch.fcluster(enlaces, 3, criterion='maxclust') - 1
centros = np.array(pd.concat([ns.centroide(k, estudiantes, grupos) for k in range(3)]))

ns.bar_plot(centros, estudiantes.columns)
plt.show()

## Traceback (most recent call last):
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\backends\backend_qt.py", line 468, in _draw_idle
##     self.draw()
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\backends\backend_agg.py", line 400, in draw
##     self.figure.draw(self.renderer)
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\artist.py", line 95, in draw_wrapper
##     result = draw(artist, renderer, *args, **kwargs)
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\artist.py", line 72, in draw_wrapper
##     return draw(artist, renderer)
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\figure.py", line 3140, in draw
##     mimage._draw_list_compositing_images(
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\image.py", line 131, in _draw_list_compositing_images
##     a.draw(renderer)
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\artist.py", line 72, in draw_wrapper
##     return draw(artist, renderer)
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\axes\_base.py", line 3028, in draw
##     self._update_title_position(renderer)
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\axes\_base.py", line 2961, in _update_title_position
##     if (ax.xaxis.get_ticks_position() in ['top', 'unknown']
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\axis.py", line 2451, in get_ticks_position
##     self._get_ticks_position()]
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\axis.py", line 2156, in _get_ticks_position
##     minor = self.minorTicks[0]
## IndexError: list index out of range

# Radar chart of the centroids obtained with the city-block metric.
ns.radar_plot(centros, estudiantes.columns)
plt.show()

# Same pipeline with the Euclidean metric, for comparison. `dist_eucl` is a
# condensed distance vector, so linkage needs no metric argument.
dist_eucl = pdist(estudiantes, metric="euclidean")
enlaces_eucl = sch.linkage(dist_eucl, method='complete')
grups_eucl = sch.fcluster(enlaces_eucl, 3, criterion='maxclust') - 1
cent_eucl = np.array(pd.concat([ns.centroide(k, estudiantes, grups_eucl) for k in range(3)]))
ns.bar_plot(cent_eucl, estudiantes.columns)
plt.show()

## Traceback (most recent call last):
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\backends\backend_qt.py", line 468, in _draw_idle
##     self.draw()
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\backends\backend_agg.py", line 400, in draw
##     self.figure.draw(self.renderer)
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\artist.py", line 95, in draw_wrapper
##     result = draw(artist, renderer, *args, **kwargs)
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\artist.py", line 72, in draw_wrapper
##     return draw(artist, renderer)
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\figure.py", line 3140, in draw
##     mimage._draw_list_compositing_images(
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\image.py", line 131, in _draw_list_compositing_images
##     a.draw(renderer)
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\artist.py", line 72, in draw_wrapper
##     return draw(artist, renderer)
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\axes\_base.py", line 3028, in draw
##     self._update_title_position(renderer)
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\axes\_base.py", line 2961, in _update_title_position
##     if (ax.xaxis.get_ticks_position() in ['top', 'unknown']
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\axis.py", line 2451, in get_ticks_position
##     self._get_ticks_position()]
##   File "C:\Users\Rodrigo\AppData\Local\R-MINI~1\envs\R-RETI~1\lib\site-packages\matplotlib\axis.py", line 2155, in _get_ticks_position
##     major = self.majorTicks[0]
## IndexError: list index out of range

# Radar chart of the centroids obtained with the Euclidean metric, to be
# compared with the city-block radar chart drawn earlier.
ns.radar_plot(cent_eucl, estudiantes.columns)
plt.show()

### Al comparar las gráficas hechas con city_block y euclidean, los resultados fueron los mismos: las gráficas muestran los mismos grupos. Esto sugiere que, para estos datos, ambas métricas están relacionadas (las dos son casos particulares de la distancia de Minkowski: city_block con p = 1 y euclidean con p = 2), aunque ninguna se calcula a partir de la otra.

Tarea5_Rodrigo_Rioseco

Rodrigo_Rioseco

2023-04-06