Boston Houses Price¶

En esta publicación estaré explorando un dataset público con características de casas de la ciudad de Boston, USA. Comenzaré realizando un análisis exploratorio y luego aplicaré un PCA para reducir la dimensionalidad y ver las relaciones existentes entre las variables y las observaciones

Importando Librerías

import pandas as pd
import numpy as np
from pydataset import data
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)
import matplotlib as plt
import matplotlib.pyplot as plt
import seaborn as sns
import tabulate
from tabulate import tabulate
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
import IPython
import mpl_axes_aligner
%matplotlib inline

Importando Dataset

Este dataset se encuentra dentro de la librería pydataset

df = data('Boston')

df.head()

Análisis Exploratorio¶

Dimensiones

print("El dataset contiene",df.shape[0],"filas y ",df.shape[1],"columnas")

El dataset contiene 506 filas y  14 columnas

Revisión nombre de columnas y posición

for i in df.columns:
    print("columna",df.columns.get_loc(i),":",i)

columna 0 : crim
columna 1 : zn
columna 2 : indus
columna 3 : chas
columna 4 : nox
columna 5 : rm
columna 6 : age
columna 7 : dis
columna 8 : rad
columna 9 : tax
columna 10 : ptratio
columna 11 : black
columna 12 : lstat
columna 13 : medv

Revisión de tipo de datos

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 506 entries, 1 to 506
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  black    506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 59.3 KB

Estadísticos Descriptivos

df.describe()

Gráfico de Promedios por variable

df.mean(axis=0).sort_values(ascending=False).plot.bar()

<AxesSubplot:>

df.std(axis=0).sort_values(ascending=False).plot.bar(color = "red")

<AxesSubplot:>

Por medio de estos gráficos podemos ver que la variable tax es una de las que tiene mayor promedio y varianza

Chequeo de valores nulos

df.isnull().any()

crim       False
zn         False
indus      False
chas       False
nox        False
rm         False
age        False
dis        False
rad        False
tax        False
ptratio    False
black      False
lstat      False
medv       False
dtype: bool

df.isnull().sum()

crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
black      0
lstat      0
medv       0
dtype: int64

Chequeo valores NA

df.isna().sum()

crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
black      0
lstat      0
medv       0
dtype: int64

Chequeo valores duplicados

if df.duplicated().any() == False:
    print("No hay filas duplicados")

No hay filas duplicados

Loop para graficar variables

Gráfico histograma + density

for i, col in enumerate(df.columns):
    plt.figure(i)
    sns.distplot(df[col])

Correlación entre variables

correlacion = df.corr().round(2)

correlacion.style.background_gradient(cmap='coolwarm')

Creación de Pipeline para realizar la estandarización Z y aplicar el modelo PCA con 3 componentes:

Estandarización depca_pipe = make_pipeline(StandardScaler(), PCA(n_components=3)) pca_pipe.fit(df) variables

$$Z = \frac{x-\overline{x}}{\sigma}$$

df_x = StandardScaler().fit_transform(df)
df_x

array([[-0.41978194,  0.28482986, -1.2879095 , ...,  0.44105193,
        -1.0755623 ,  0.15968566],
       [-0.41733926, -0.48772236, -0.59338101, ...,  0.44105193,
        -0.49243937, -0.10152429],
       [-0.41734159, -0.48772236, -0.59338101, ...,  0.39642699,
        -1.2087274 ,  1.32424667],
       ...,
       [-0.41344658, -0.48772236,  0.11573841, ...,  0.44105193,
        -0.98304761,  0.14880191],
       [-0.40776407, -0.48772236,  0.11573841, ...,  0.4032249 ,
        -0.86530163, -0.0579893 ],
       [-0.41500016, -0.48772236,  0.11573841, ...,  0.44105193,
        -0.66905833, -1.15724782]])

df_scaled = pd.DataFrame(data = df_x,
            columns = df.columns)

df_scaled.head()

modelo_pca = PCA()
scores = modelo_pca.fit_transform(df_x)

Análisis de Componentes Principales¶

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 7))
componentes = modelo_pca.components_
plt.imshow(componentes.T, cmap='viridis', aspect='auto')
plt.yticks(range(len(df.columns)), df.columns)
plt.xticks(range(len(df.columns[:14])), np.arange(modelo_pca.n_components_) + 1)
plt.grid(False)
plt.colorbar()
plt.show()

Por medio de este gráfico podemos por cada componente que relación tiene con cada variable

Varianza Explicada por Componente

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 4))
ax.bar(
    x      = np.arange(modelo_pca.n_components_) + 1,
    height = modelo_pca.explained_variance_ratio_
)

for x, y in zip(np.arange(len(df.columns)) + 1, modelo_pca.explained_variance_ratio_):
    label = round(y, 2)
    ax.annotate(
        label,
        (x,y),
        textcoords="offset points",
        xytext=(0,10),
        ha='center'
    )

ax.set_xticks(np.arange(modelo_pca.n_components_) + 1)
ax.set_ylim(0, 1.1)
ax.set_title('Porcentaje de varianza explicada por cada componente')
ax.set_xlabel('Componente principal')
ax.set_ylabel('Por. varianza explicada')
plt.show()

Varianza explicada Acumulada

prop_varianza_acum = modelo_pca.explained_variance_ratio_.cumsum()
print(prop_varianza_acum)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(7, 4))
ax.plot(
    np.arange(len(df.columns[:14])) + 1,
    prop_varianza_acum,
    marker = 'o'
)

for x, y in zip(np.arange(len(df.columns[:14])) + 1, prop_varianza_acum):
    label = round(y, 2)
    ax.annotate(
        label,
        (x,y),
        textcoords="offset points",
        xytext=(0,10),
        ha='center'
    )
    
ax.set_ylim(0, 1.1)
ax.set_xticks(np.arange(modelo_pca.n_components_) + 1)
ax.set_title('Porcentaje de varianza explicada acumulada')
ax.set_xlabel('Componente principal')
ax.set_ylabel('Por. varianza acumulada')
plt.show()

[0.46757068 0.58539439 0.68174481 0.74506909 0.80584762 0.85299125
 0.89123488 0.92002606 0.9398306  0.95784899 0.9730483  0.98611883
 0.99569095 1.        ]

Resultados del PCA

resultados = modelo_pca.transform(df_x)

lista = []
for i in range(1,15,1):
    n = "PC"+str(i)
    lista.append(n)

df_resultados = pd.DataFrame(
    data    = resultados,
    columns = lista)

df_resultados.head()

Scatterplot PC1 & PC2

plt.figure(figsize=(12,8))
plt.scatter(df_resultados['PC1'],df_resultados['PC2'],cmap='plasma')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.show()

Scatterplot PC1 & PC2 con índices

plt.figure(figsize=(12,8))
plt.scatter(df_resultados['PC1'],df_resultados['PC2'],cmap='plasma',color="white")
plt.xlabel('PC 1')
plt.ylabel('PC 2')

for i, label in enumerate(range(0,df_resultados.shape[0],1)):
    plt.annotate(label, (df_resultados['PC1'][i], df_resultados['PC2'][i]))

plt.show()

Scatterplot en 3D

from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(10,10))
 
# choose projection 3d for creating a 3d graph
axis = fig.add_subplot(111, projection='3d')
 
# x[:,0]is pc1,x[:,1] is pc2 while x[:,2] is pc3
axis.scatter(df_resultados['PC1'],df_resultados['PC2'],df_resultados['PC3'],c = 'r',marker = 'o',cmap='plasma')
axis.set_xlabel("PC1", fontsize=10)
axis.set_ylabel("PC2", fontsize=10)
axis.set_zlabel("PC3", fontsize=10)

plt.show()

Loadings

df_Loadings = pd.DataFrame(modelo_pca.components_,columns=df.columns,index=df_Scores.columns)
df_Loadings.head()

font1 = {'family':'serif','color':'black','size':35}

def biplot(dfScores: pd.DataFrame, dfLoadings: pd.DataFrame) -> None:
    
    #create figure and axis objects
    fig,ax = plt.subplots(figsize=(15,8))
    
    #make a scores plot
    ax.scatter(dfScores.PC1.values,dfScores.PC2.values, color='y')
    #set x-axis label
    ax.set_xlabel("PC1",fontsize=10)
    #set y-axis label
    ax.set_ylabel("PC2",fontsize=10)
    
    #create a second set of axes
    ax2 = ax.twinx().twiny()
    
    #setup font dictionary
    font = {'color':  'g',
            'weight': 'bold',
            'size': 12,
            }
    
    #make a loadings plot
    for col in dfLoadings.columns.values:
        #where do our loading vectors end?
        tipx = dfLoadings.loc['PC1',col]
        tipy = dfLoadings.loc['PC2',col]
        #draw the vector, and write label text for col
        ax2.arrow(0, 0, tipx, tipy, color = 'r', alpha = 0.5)
        ax2.text(tipx*1.05, tipy*1.05, col, fontdict = font, ha = 'center', va = 'center')
    
    #align x = 0 of ax and ax2 with the center of figure
    mpl_axes_aligner.align.xaxes(ax, 0, ax2, 0, 0.5)
    #align y = 0 of ax and ax2 with the center of figure
    mpl_axes_aligner.align.yaxes(ax, 0, ax2, 0, 0.5)
    
    plt.axhline(0,color="black",linestyle = 'dashed')
    plt.axvline(0,color="black",linestyle = 'dashed')
    plt.title("BIPLOT PCA - BOSTON HOUSES",fontdict = font1)
    #show plot
    plt.show()

Biplot

biplot(df_resultados, df_Loadings)

	crim	zn	indus	nox	rm	age	dis	rad	tax	ptratio	black	lstat	medv
1	0.00632	18.0	2.31	0.538	6.575	65.2	4.0900	1	296	15.3	396.90	4.98	24.0
2	0.02731	0.0	7.07	0.469	6.421	78.9	4.9671	2	242	17.8	396.90	9.14	21.6
3	0.02729	0.0	7.07	0.469	7.185	61.1	4.9671	2	242	17.8	392.83	4.03	34.7
4	0.03237	0.0	2.18	0.458	6.998	45.8	6.0622	3	222	18.7	394.63	2.94	33.4
5	0.06905	0.0	2.18	0.458	7.147	54.2	6.0622	3	222	18.7	396.90	5.33	36.2

	crim	zn	indus	chas	nox	rm	age	dis	rad	tax	ptratio	black	lstat	medv
count	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000
mean	3.613524	11.363636	11.136779	0.069170	0.554695	6.284634	68.574901	3.795043	9.549407	408.237154	18.455534	356.674032	12.653063	22.532806
std	8.601545	23.322453	6.860353	0.253994	0.115878	0.702617	28.148861	2.105710	8.707259	168.537116	2.164946	91.294864	7.141062	9.197104
min	0.006320	0.000000	0.460000	0.000000	0.385000	3.561000	2.900000	1.129600	1.000000	187.000000	12.600000	0.320000	1.730000	5.000000
25%	0.082045	0.000000	5.190000	0.000000	0.449000	5.885500	45.025000	2.100175	4.000000	279.000000	17.400000	375.377500	6.950000	17.025000
50%	0.256510	0.000000	9.690000	0.000000	0.538000	6.208500	77.500000	3.207450	5.000000	330.000000	19.050000	391.440000	11.360000	21.200000
75%	3.677083	12.500000	18.100000	0.000000	0.624000	6.623500	94.075000	5.188425	24.000000	666.000000	20.200000	396.225000	16.955000	25.000000
max	88.976200	100.000000	27.740000	1.000000	0.871000	8.780000	100.000000	12.126500	24.000000	711.000000	22.000000	396.900000	37.970000	50.000000

	crim	zn	indus	chas	nox	rm	age	dis	rad	tax	ptratio	black	lstat	medv
crim	1.000000	-0.200000	0.410000	-0.060000	0.420000	-0.220000	0.350000	-0.380000	0.630000	0.580000	0.290000	-0.390000	0.460000	-0.390000
zn	-0.200000	1.000000	-0.530000	-0.040000	-0.520000	0.310000	-0.570000	0.660000	-0.310000	-0.310000	-0.390000	0.180000	-0.410000	0.360000
indus	0.410000	-0.530000	1.000000	0.060000	0.760000	-0.390000	0.640000	-0.710000	0.600000	0.720000	0.380000	-0.360000	0.600000	-0.480000
chas	-0.060000	-0.040000	0.060000	1.000000	0.090000	0.090000	0.090000	-0.100000	-0.010000	-0.040000	-0.120000	0.050000	-0.050000	0.180000
nox	0.420000	-0.520000	0.760000	0.090000	1.000000	-0.300000	0.730000	-0.770000	0.610000	0.670000	0.190000	-0.380000	0.590000	-0.430000
rm	-0.220000	0.310000	-0.390000	0.090000	-0.300000	1.000000	-0.240000	0.210000	-0.210000	-0.290000	-0.360000	0.130000	-0.610000	0.700000
age	0.350000	-0.570000	0.640000	0.090000	0.730000	-0.240000	1.000000	-0.750000	0.460000	0.510000	0.260000	-0.270000	0.600000	-0.380000
dis	-0.380000	0.660000	-0.710000	-0.100000	-0.770000	0.210000	-0.750000	1.000000	-0.490000	-0.530000	-0.230000	0.290000	-0.500000	0.250000
rad	0.630000	-0.310000	0.600000	-0.010000	0.610000	-0.210000	0.460000	-0.490000	1.000000	0.910000	0.460000	-0.440000	0.490000	-0.380000
tax	0.580000	-0.310000	0.720000	-0.040000	0.670000	-0.290000	0.510000	-0.530000	0.910000	1.000000	0.460000	-0.440000	0.540000	-0.470000
ptratio	0.290000	-0.390000	0.380000	-0.120000	0.190000	-0.360000	0.260000	-0.230000	0.460000	0.460000	1.000000	-0.180000	0.370000	-0.510000
black	-0.390000	0.180000	-0.360000	0.050000	-0.380000	0.130000	-0.270000	0.290000	-0.440000	-0.440000	-0.180000	1.000000	-0.370000	0.330000
lstat	0.460000	-0.410000	0.600000	-0.050000	0.590000	-0.610000	0.600000	-0.500000	0.490000	0.540000	0.370000	-0.370000	1.000000	-0.740000
medv	-0.390000	0.360000	-0.480000	0.180000	-0.430000	0.700000	-0.380000	0.250000	-0.380000	-0.470000	-0.510000	0.330000	-0.740000	1.000000

	crim	zn	indus	chas	nox	rm	age	dis	rad	tax	ptratio	black	lstat	medv
0	-0.419782	0.284830	-1.287909	-0.272599	-0.144217	0.413672	-0.120013	0.140214	-0.982843	-0.666608	-1.459000	0.441052	-1.075562	0.159686
1	-0.417339	-0.487722	-0.593381	-0.272599	-0.740262	0.194274	0.367166	0.557160	-0.867883	-0.987329	-0.303094	0.441052	-0.492439	-0.101524
2	-0.417342	-0.487722	-0.593381	-0.272599	-0.740262	1.282714	-0.265812	0.557160	-0.867883	-0.987329	-0.303094	0.396427	-1.208727	1.324247
3	-0.416750	-0.487722	-1.306878	-0.272599	-0.835284	1.016303	-0.809889	1.077737	-0.752922	-1.106115	0.113032	0.416163	-1.361517	1.182758
4	-0.412482	-0.487722	-1.306878	-0.272599	-0.835284	1.228577	-0.511180	1.077737	-0.752922	-1.106115	0.113032	0.441052	-1.026501	1.487503

	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	PC9	PC10	PC11	PC12	PC13	PC14
0	-2.087344	0.492853	-0.335991	-0.028088	-1.012801	-0.262092	0.327878	0.160216	-0.471148	-0.205773	-0.780066	-0.109629	-0.494680	0.247931
1	-1.373382	-0.170924	-0.965964	-0.432406	-0.254645	0.303779	0.559110	-0.288650	-0.195830	-0.246239	-0.277543	0.588701	-0.113057	-0.113117
2	-2.376553	0.914027	-0.090026	-1.123913	0.032788	0.508900	0.487534	0.082490	0.054227	-0.195005	0.028905	0.416418	0.357030	0.051307
3	-2.837779	0.194870	0.060545	-1.065682	0.460334	0.714008	0.623329	0.239722	-0.358620	-0.155891	-0.244565	0.134699	0.577809	0.089625
4	-2.772916	0.433299	0.064042	-1.129637	0.382179	0.655855	0.704497	-0.102626	-0.408753	-0.000421	0.007758	0.221562	0.778435	0.148311

	crim	zn	indus	chas	nox	rm	age	dis	rad	tax	ptratio	black	lstat	medv
PC1	0.242284	-0.245435	0.331860	-0.005027	0.325194	-0.202817	0.296977	-0.298170	0.303413	0.324033	0.207680	-0.196638	0.311398	-0.266636
PC2	-0.065873	-0.148003	0.127076	0.410669	0.254276	0.434006	0.260303	-0.359150	0.031150	0.008851	-0.314623	0.026481	-0.201245	0.444924
PC3	0.395077	0.394546	-0.066082	-0.125305	-0.046476	0.353406	-0.200823	0.157069	0.418510	0.343232	0.000399	-0.361376	-0.161060	0.163189
PC4	0.100366	0.342958	-0.009627	0.700406	0.053708	-0.293357	-0.078426	0.184748	-0.051374	-0.026811	-0.342036	-0.201741	0.242621	-0.180298
PC5	-0.004958	-0.114495	0.022584	0.535198	-0.194606	0.008320	-0.149750	0.106219	0.230352	0.163426	0.615707	0.367461	-0.178359	0.050660