datos cuantitativos en Python parte 1

# importando modulos necesarios


import matplotlib.pyplot as plt
import numpy as np 
from scipy import stats 
import pandas as pd 
import seaborn as sns 
from pydataset import data

# Seaborn aesthetic parameters
sns.set_palette("deep", desat=.6)
# Default figure size (8 x 4 inches). It is set through matplotlib's rcParams
# because sns.set_context() only applies keys that belong to the plotting
# context (font and line scaling); in current seaborn releases a
# "figure.figsize" entry passed to it is dropped without any warning.
plt.rcParams["figure.figsize"] = (8, 4)
# Load the 'faithful' dataset through pydataset and preview its first 10 rows
faithful = data('faithful')
faithful.head(10)
eruptions waiting
1 3.600 79
2 1.800 54
3 3.333 74
4 2.283 62
5 4.533 85
6 2.883 55
7 4.700 88
8 3.600 85
9 1.950 51
10 4.350 85
# Histogram of the eruption durations drawn with 8 bars
eje_hist = faithful['eruptions'].hist(bins=8, color='red', figsize=(10, 6))
eje_hist.set_xlabel("Duración en minutos")
eje_hist.set_ylabel("Frecuencia")
plt.show()

# Frequency distribution.
# First, build the class edges: from 1.5 up to (but excluding) 6.0 in steps of 0.5.
contenedores = np.arange(1.5, 6., 0.5)

# Then assign every eruption duration to the interval it falls in
frec = pd.cut(faithful['eruptions'], contenedores)
frec=frec.sort_values()
frec
# Finally, count the observations per interval to build the frequency table.
# Series.value_counts() is used because the top-level pd.value_counts()
# helper is deprecated in recent pandas releases.
tabla_frec = frec.value_counts()
tabla_frec
(4.0, 4.5]    75
(1.5, 2.0]    55
(4.5, 5.0]    54
(2.0, 2.5]    37
(3.5, 4.0]    34
(3.0, 3.5]     9
(2.5, 3.0]     5
(5.0, 5.5]     3
Name: eruptions, dtype: int64
# Smallest eruption duration in the sample
# NOTE(review): the bare number after each call looks like pasted notebook output — confirm
faithful['eruptions'].min()
1.6
# Largest eruption duration in the sample
faithful['eruptions'].max()
5.1
# Class edges for the grouped table: start 0.05 below 1.6, stop before 5.1 + 1,
# and advance in steps of 0.7.
limite_inferior = 1.6 - 0.05
limite_superior = 5.1 + 1
bins = list(np.arange(limite_inferior, limite_superior, 0.7))

# Label every eruption with the class interval (taken from `bins`) it falls in
faithful["erup_grup"] = pd.cut(faithful["eruptions"], bins=bins)
faithful['erup_grup']
# Absolute frequency per class. observed=False is passed explicitly so that
# every class defined by `bins` stays in the table even when it is empty, and
# so that pandas does not warn about the upcoming change of that default for
# categorical groupers.
erup_grup_counts = (
    faithful.groupby("erup_grup", observed=False)
    .agg(frecuencia=("eruptions", "count"))
)
erup_grup_counts
frecuencia
erup_grup
(1.55, 2.25] 79
(2.25, 2.95] 18
(2.95, 3.65] 15
(3.65, 4.35] 77
(4.35, 5.05] 81
(5.05, 5.75] 2
# Cumulative absolute frequency: running total of the class counts
frecuencia_abs = erup_grup_counts["frecuencia"]
erup_grup_counts["cum_frequency"] = frecuencia_abs.cumsum()
erup_grup_counts
frecuencia cum_frequency
erup_grup
(1.55, 2.25] 79 79
(2.25, 2.95] 18 97
(2.95, 3.65] 15 112
(3.65, 4.35] 77 189
(4.35, 5.05] 81 270
(5.05, 5.75] 2 272
# Relative frequency: class count divided by the number of rows in `faithful`
total_observaciones = len(faithful)
erup_grup_counts["rel_frequency"] = erup_grup_counts["frecuencia"].div(total_observaciones)
erup_grup_counts
frecuencia cum_frequency rel_frequency
erup_grup
(1.55, 2.25] 79 79 0.290441
(2.25, 2.95] 18 97 0.066176
(2.95, 3.65] 15 112 0.055147
(3.65, 4.35] 77 189 0.283088
(4.35, 5.05] 81 270 0.297794
(5.05, 5.75] 2 272 0.007353
# Cumulative relative frequency: running total of the relative frequencies
frecuencia_rel = erup_grup_counts["rel_frequency"]
erup_grup_counts["relacum_frequency"] = frecuencia_rel.cumsum()
erup_grup_counts
frecuencia cum_frequency rel_frequency relacum_frequency
erup_grup
(1.55, 2.25] 79 79 0.290441 0.290441
(2.25, 2.95] 18 97 0.066176 0.356618
(2.95, 3.65] 15 112 0.055147 0.411765
(3.65, 4.35] 77 189 0.283088 0.694853
(4.35, 5.05] 81 270 0.297794 0.992647
(5.05, 5.75] 2 272 0.007353 1.000000
# Percentage frequency: relative frequency scaled by 100
erup_grup_counts["frec_%"] = erup_grup_counts["rel_frequency"].mul(100)
erup_grup_counts
frecuencia cum_frequency rel_frequency relacum_frequency frec_%
erup_grup
(1.55, 2.25] 79 79 0.290441 0.290441 29.044118
(2.25, 2.95] 18 97 0.066176 0.356618 6.617647
(2.95, 3.65] 15 112 0.055147 0.411765 5.514706
(3.65, 4.35] 77 189 0.283088 0.694853 28.308824
(4.35, 5.05] 81 270 0.297794 0.992647 29.779412
(5.05, 5.75] 2 272 0.007353 1.000000 0.735294
# Cumulative percentage frequency: running total of the percentage column
frecuencia_pct = erup_grup_counts["frec_%"]
erup_grup_counts["frec_%acum"] = frecuencia_pct.cumsum()
erup_grup_counts
frecuencia cum_frequency rel_frequency relacum_frequency frec_% frec_%acum
erup_grup
(1.55, 2.25] 79 79 0.290441 0.290441 29.044118 29.044118
(2.25, 2.95] 18 97 0.066176 0.356618 6.617647 35.661765
(2.95, 3.65] 15 112 0.055147 0.411765 5.514706 41.176471
(3.65, 4.35] 77 189 0.283088 0.694853 28.308824 69.485294
(4.35, 5.05] 81 270 0.297794 0.992647 29.779412 99.264706
(5.05, 5.75] 2 272 0.007353 1.000000 0.735294 100.000000
#import pandas as pd
#tablafrec=pd.DataFrame({'fa':erup_grup_counts['frecuencia'],'faa':erup_grup_counts['cum_frequency'],'fr':erup_grup_counts['rel_frequency'],'fra':erup_grup_counts['relacum_frequency'],'fporcen':erup_grup_counts['frec_%'],'fpa':erup_grup_counts['frec_%acum']})
#media=erup_grup_counts['frecuencia'].sum()
#media
# Scatter plot of eruption duration (x axis) against waiting time (y axis)
disp = faithful.plot.scatter(x='eruptions', y='waiting')

# Descriptive statistics of the 'eruptions' column.
# NOTE(review): the lines following each call look like pasted notebook output — confirm
# Arithmetic mean
faithful['eruptions'].mean()
3.4877830882352936
# Geometric mean
stats.gmean(faithful['eruptions'])
3.2713131325361786
# Harmonic mean
stats.hmean(faithful['eruptions'])
3.0389330499472607
# Median
faithful['eruptions'].median()
4.0
# Trimmed mean, cutting a 0.10 proportion from both the upper and the lower tail
stats.trim_mean(faithful['eruptions'], .10)
3.5298073394495413
# Mode (the result lists every most-frequent value, here two of them)
faithful['eruptions'].mode()
0    1.867
1    4.500
Name: eruptions, dtype: float64
# Variance
faithful['eruptions'].var()
1.302728332849468
# Standard deviation
faithful['eruptions'].std()
1.141371251105208
# Quartiles (25th, 50th and 75th percentiles)
faithful['eruptions'].quantile([.25, .5, .75])
0.25    2.16275
0.50    4.00000
0.75    4.45425
Name: eruptions, dtype: float64
# Box plots of the eruption durations.
# The values are passed through the explicit `x` keyword: the first positional
# parameter of sns.boxplot changed from `x` to `data` across seaborn releases,
# so a positional argument does not draw the same plot on every version.
cajas = sns.boxplot(x=faithful['eruptions'])

# Open a fresh figure so the matplotlib box plot is not drawn on top of the
# seaborn one when both statements run in the same session.
plt.figure()
plt.boxplot(faithful['eruptions'])
plt.show()

# Covariance matrix of the two numeric variables.
# The columns are selected explicitly because `faithful` also carries the
# categorical "erup_grup" column by this point, and cov() cannot convert it
# to float on pandas versions where numeric_only defaults to False.
faithful[['eruptions', 'waiting']].cov()
eruptions waiting
eruptions 1.302728 13.977808
waiting 13.977808 184.823312
# Correlation coefficient matrix of the two numeric variables.
# The columns are selected explicitly because `faithful` also carries the
# categorical "erup_grup" column by this point, and corr() cannot convert it
# to float on pandas versions where numeric_only defaults to False.
faithful[['eruptions', 'waiting']].corr()
eruptions waiting
eruptions 1.000000 0.900811
waiting 0.900811 1.000000
# Statistical summary (count, mean, std, min, quartiles, max) of the eruption durations
faithful.loc[:, 'eruptions'].describe()
count    272.000000
mean       3.487783
std        1.141371
min        1.600000
25%        2.162750
50%        4.000000
75%        4.454250
max        5.100000
Name: eruptions, dtype: float64
# Grid of pairwise plots built from the `faithful` DataFrame
par = sns.pairplot(data=faithful)

# Keep only the rows whose eruption lasted longer than 2.55, then preview them
mascara_largas = faithful['eruptions'] > 2.55
datos = faithful[mascara_largas]
datos.head()
eruptions waiting erup_grup
1 3.600 79 (2.95, 3.65]
3 3.333 74 (2.95, 3.65]
5 4.533 85 (4.35, 5.05]
6 2.883 55 (2.25, 2.95]
7 4.700 88 (4.35, 5.05]
# Scatter plot: waiting time (x axis) against eruption duration (y axis)
# ==============================================================================
fig, ax = plt.subplots(1, 1, figsize=(6,4))
ax.scatter(x=faithful.waiting, y=faithful.eruptions, alpha= 0.8)
# Label each axis with the column actually plotted on it (the previous
# 'Altura'/'Peso' labels did not correspond to this dataset).
ax.set_xlabel('waiting')
ax.set_ylabel('eruptions')
Text(0, 0.5, 'eruptions')

import statsmodels.api as sm
# Distribution of each variable: 5-bin histogram plus a rug of the observations
# ==============================================================================
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

axs[0].hist(x=faithful.waiting, bins=5, color="#3182bd", alpha=0.5)
# dtype=float is required for the rug: full_like copies the input dtype, and
# `waiting` appears to hold integers, so -0.01 would otherwise be stored as 0.
axs[0].plot(
    faithful.waiting,
    np.full_like(faithful.waiting, -0.01, dtype=float),
    '|k',
    markeredgewidth=1,
)
axs[0].set_title('Distribución  (waiting)')
axs[0].set_xlabel('waiting')
axs[0].set_ylabel('counts')

axs[1].hist(x=faithful.eruptions, bins=5, color="#3182bd", alpha=0.5)
axs[1].plot(
    faithful.eruptions,
    np.full_like(faithful.eruptions, -0.01, dtype=float),
    '|k',
    markeredgewidth=1,
)
# The title names only the plotted column; the stray word "peso" was removed.
axs[1].set_title('Distribución (eruptions)')
axs[1].set_xlabel('eruptions')
axs[1].set_ylabel('counts')
Text(0, 0.5, 'counts')

# Q-Q plots, one panel per variable
# ==============================================================================
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# Both panels share the same settings; only the plotted column changes. The
# title is built from the column name so that each panel is labelled with the
# variable it shows (both were previously titled 'height').
for eje, columna in zip(axs, ('waiting', 'eruptions')):
    sm.qqplot(
        faithful[columna],
        fit   = True,
        line  = 'q',
        alpha = 0.4,
        lw    = 2,
        ax    = eje
    )
    eje.set_title(f'Gráfico Q-Q {columna}', fontsize = 10, fontweight = "bold")
    eje.tick_params(labelsize = 7)

import pingouin as pg
from scipy import stats
# Normalidad de los residuos Shapiro-Wilk test
# ==============================================================================
shapiro_test = stats.shapiro(faithful.waiting)
print(f"Variable height: {shapiro_test}")
shapiro_test = stats.shapiro(faithful.eruptions)
print(f"Variable weight: {shapiro_test}")
Variable height: ShapiroResult(statistic=0.922146737575531, pvalue=1.015239073365315e-10)
Variable weight: ShapiroResult(statistic=0.8459155559539795, pvalue=9.03598972142245e-16)
# Correlation between waiting time and eruption duration, computed with pandas
# ==============================================================================
espera = faithful['waiting']
erupcion = faithful['eruptions']
# One line per method: printed heading paired with the pandas method name
for encabezado, metodo in (
    ('Correlación Pearson: ', 'pearson'),
    ('Correlación spearman: ', 'spearman'),
    ('Correlación kendall: ', 'kendall'),
):
    print(encabezado, espera.corr(erupcion, method=metodo))
Correlación Pearson:  0.9008111683218127
Correlación spearman:  0.7779720576516121
Correlación kendall:  0.5747673538950212
# Data
# ==============================================================================
url = ('https://raw.githubusercontent.com/JoaquinAmatRodrigo/' +
       'Estadistica-machine-learning-python/master/data/SaratogaHouses.csv')
datos = pd.read_csv(url, sep=",")

# Rename the columns so that they are more descriptive
datos.columns = ["precio", "metros_totales", "antiguedad", "precio_terreno",
                 "metros_habitables", "universitarios", "dormitorios", 
                 "chimenea", "banyos", "habitaciones", "calefaccion",
                 "consumo_calefacion", "desague", "vistas_lago",
                 "nueva_construccion", "aire_acondicionado"]

# Keep the numeric variables only. 'number' matches every integer and float
# width, whereas 'int' resolves to the platform's default integer type and can
# therefore miss integer columns stored with a different width.
datos = datos.select_dtypes(include='number')
datos.head(20)
precio metros_totales antiguedad precio_terreno metros_habitables universitarios dormitorios chimenea banyos habitaciones
0 132500 0.09 42 50000 906 35 2 1 1.0 5
1 181115 0.92 0 22300 1953 51 3 0 2.5 6
2 109000 0.19 133 7300 1944 51 4 1 1.0 8
3 155000 0.41 13 18700 1944 51 3 1 1.5 5
4 86060 0.11 0 15000 840 51 2 0 1.0 3
5 120000 0.68 31 14000 1152 22 4 1 1.0 8
6 153000 0.40 33 23300 2752 51 4 1 1.5 8
7 170000 1.21 23 14600 1662 35 4 1 1.5 9
8 90000 0.83 36 22200 1632 51 3 0 1.5 8
9 122900 1.94 4 21200 1416 44 3 0 1.5 6
10 325000 2.29 123 12600 2894 51 7 0 1.0 12
11 120000 0.92 1 22300 1624 51 3 0 2.0 6
12 85860 8.97 13 4800 704 41 2 0 1.0 4
13 97000 0.11 153 3100 1383 57 3 0 2.0 5
14 127000 0.14 9 300 1300 41 3 0 1.5 8
15 89900 0.00 88 2500 936 57 3 0 1.0 4
16 155000 0.13 9 300 1300 41 3 0 1.5 7
17 253750 2.00 0 49800 2816 71 4 1 2.5 12
18 60000 0.21 82 8500 924 35 2 0 1.0 6
19 87500 0.88 17 19400 1092 35 3 0 1.0 6
# Correlation matrix (Pearson) across the columns of `datos`
# ==============================================================================
metodo_corr = 'pearson'
corr_matrix = datos.corr(method=metodo_corr)
corr_matrix
precio metros_totales antiguedad precio_terreno metros_habitables universitarios dormitorios chimenea banyos habitaciones
precio 1.000000 0.158333 -0.188793 0.581266 0.712390 0.200119 0.400349 0.376786 0.597250 0.531170
metros_totales 0.158333 1.000000 -0.016352 0.059222 0.163450 -0.033148 0.113982 0.085226 0.084823 0.137604
antiguedad -0.188793 -0.016352 1.000000 -0.021818 -0.174242 -0.037785 0.027125 -0.172022 -0.361897 -0.082264
precio_terreno 0.581266 0.059222 -0.021818 1.000000 0.423441 0.228427 0.202449 0.211727 0.297498 0.298865
metros_habitables 0.712390 0.163450 -0.174242 0.423441 1.000000 0.209981 0.656196 0.473788 0.718564 0.733666
universitarios 0.200119 -0.033148 -0.037785 0.228427 0.209981 1.000000 0.162919 0.246626 0.179541 0.157068
dormitorios 0.400349 0.113982 0.027125 0.202449 0.656196 0.162919 1.000000 0.284475 0.458033 0.671863
chimenea 0.376786 0.085226 -0.172022 0.211727 0.473788 0.246626 0.284475 1.000000 0.436234 0.319894
banyos 0.597250 0.084823 -0.361897 0.297498 0.718564 0.179541 0.458033 0.436234 1.000000 0.517585
habitaciones 0.531170 0.137604 -0.082264 0.298865 0.733666 0.157068 0.671863 0.319894 0.517585 1.000000
# Heatmap of the correlation matrix
# ==============================================================================
fig, ax = plt.subplots(figsize=(5, 5))

# Diverging palette centred on 0 so that the sign of each correlation is visible
paleta_divergente = sns.diverging_palette(20, 220, n=200)
opciones_heatmap = {
    "annot": True,
    "cbar": False,
    "annot_kws": {"size": 8},
    "vmin": -1,
    "vmax": 1,
    "center": 0,
    "cmap": paleta_divergente,
    "square": True,
}
sns.heatmap(corr_matrix, ax=ax, **opciones_heatmap)

# Rotate the x tick labels and right-align them
etiquetas_x = ax.get_xticklabels()
ax.set_xticklabels(etiquetas_x, rotation=45, horizontalalignment='right')

ax.tick_params(labelsize=10)