datos cuantitativos en Python parte 1

# importando modulos necesarios


import matplotlib.pyplot as plt
import numpy as np 
from scipy import stats 
import pandas as pd 
import seaborn as sns 
from pydataset import data

# Aesthetic defaults for seaborn / matplotlib
sns.set_palette("deep", desat=.6)
# seaborn's set_context() only honours its own context keys (font sizes,
# line widths, ...) and silently discards "figure.figsize", so the default
# figure size is set directly through matplotlib's rcParams instead.
plt.rcParams["figure.figsize"] = (8, 4)

# Load the 'faithful' dataset shipped with pydataset and preview its
# first ten rows (columns: eruptions, waiting).
faithful = data("faithful")
faithful.head(n=10)

	eruptions	waiting
1	3.600	79
2	1.800	54
3	3.333	74
4	2.283	62
5	4.533	85
6	2.883	55
7	4.700	88
8	3.600	85
9	1.950	51
10	4.350	85

# Histogram of the eruption durations, split into 8 bins
eje_hist = faithful.eruptions.hist(figsize=(10, 6), color='red', bins=8)
eje_hist.set_xlabel("Duración en minutos")
eje_hist.set_ylabel("Frecuencia")
plt.show()

# Frequency distribution.
# Step 1: build the class boundaries 1.5, 2.0, ..., 5.5.
contenedores = np.arange(1.5, 6., 0.5)

# Step 2: assign every eruption to its interval and order the result by
# interval.
frec = pd.cut(faithful['eruptions'], contenedores)
frec = frec.sort_values()

# Step 3: count the observations per interval to obtain the frequency table.
# Series.value_counts() replaces the top-level pd.value_counts(), which is
# deprecated since pandas 2.1.
tabla_frec = frec.value_counts()
tabla_frec

(4.0, 4.5]    75
(1.5, 2.0]    55
(4.5, 5.0]    54
(2.0, 2.5]    37
(3.5, 4.0]    34
(3.0, 3.5]     9
(2.5, 3.0]     5
(5.0, 5.5]     3
Name: eruptions, dtype: int64

# Shortest recorded eruption
faithful.eruptions.min()

1.6

# Longest recorded eruption
faithful.eruptions.max()

5.1

# Class boundaries derived from the data instead of hard-coded literals:
# start slightly below the minimum and step by 0.7 until past the maximum.
erup_min = faithful["eruptions"].min()
erup_max = faithful["eruptions"].max()
bins = list(np.arange(erup_min - 0.05, erup_max + 1, 0.7))

# Label every observation with the interval it falls into.
faithful["erup_grup"] = pd.cut(faithful["eruptions"], bins=bins)

# Absolute frequency per class. `erup_grup` is categorical, so observed=False
# is passed explicitly: it keeps empty classes in the table and avoids the
# FutureWarning newer pandas emits when the argument is omitted.
erup_grup_counts = (
    faithful.groupby("erup_grup", observed=False)
    .agg(frecuencia=("eruptions", "count"))
)
erup_grup_counts

	frecuencia
erup_grup
(1.55, 2.25]	79
(2.25, 2.95]	18
(2.95, 3.65]	15
(3.65, 4.35]	77
(4.35, 5.05]	81
(5.05, 5.75]	2

# Cumulative absolute frequency: running total of the class counts
frecuencia_acumulada = erup_grup_counts["frecuencia"].cumsum()
erup_grup_counts["cum_frequency"] = frecuencia_acumulada
erup_grup_counts

	frecuencia	cum_frequency
erup_grup
(1.55, 2.25]	79	79
(2.25, 2.95]	18	97
(2.95, 3.65]	15	112
(3.65, 4.35]	77	189
(4.35, 5.05]	81	270
(5.05, 5.75]	2	272

# Relative frequency: class count divided by the number of rows in `faithful`
total_observaciones = len(faithful)
erup_grup_counts["rel_frequency"] = erup_grup_counts["frecuencia"].div(total_observaciones)
erup_grup_counts

	frecuencia	cum_frequency	rel_frequency
erup_grup
(1.55, 2.25]	79	79	0.290441
(2.25, 2.95]	18	97	0.066176
(2.95, 3.65]	15	112	0.055147
(3.65, 4.35]	77	189	0.283088
(4.35, 5.05]	81	270	0.297794
(5.05, 5.75]	2	272	0.007353

# Cumulative relative frequency: running total of the relative frequencies
relativa_acumulada = erup_grup_counts["rel_frequency"].cumsum()
erup_grup_counts["relacum_frequency"] = relativa_acumulada
erup_grup_counts

	frecuencia	cum_frequency	rel_frequency	relacum_frequency
erup_grup
(1.55, 2.25]	79	79	0.290441	0.290441
(2.25, 2.95]	18	97	0.066176	0.356618
(2.95, 3.65]	15	112	0.055147	0.411765
(3.65, 4.35]	77	189	0.283088	0.694853
(4.35, 5.05]	81	270	0.297794	0.992647
(5.05, 5.75]	2	272	0.007353	1.000000

# Relative frequency expressed as a percentage
erup_grup_counts["frec_%"] = erup_grup_counts["rel_frequency"].mul(100)
erup_grup_counts

	frecuencia	cum_frequency	rel_frequency	relacum_frequency	frec_%
erup_grup
(1.55, 2.25]	79	79	0.290441	0.290441	29.044118
(2.25, 2.95]	18	97	0.066176	0.356618	6.617647
(2.95, 3.65]	15	112	0.055147	0.411765	5.514706
(3.65, 4.35]	77	189	0.283088	0.694853	28.308824
(4.35, 5.05]	81	270	0.297794	0.992647	29.779412
(5.05, 5.75]	2	272	0.007353	1.000000	0.735294

# Cumulative percentage: running total of the percentage column
porcentaje_acumulado = erup_grup_counts["frec_%"].cumsum()
erup_grup_counts["frec_%acum"] = porcentaje_acumulado
erup_grup_counts

	frecuencia	cum_frequency	rel_frequency	relacum_frequency	frec_%	frec_%acum
erup_grup
(1.55, 2.25]	79	79	0.290441	0.290441	29.044118	29.044118
(2.25, 2.95]	18	97	0.066176	0.356618	6.617647	35.661765
(2.95, 3.65]	15	112	0.055147	0.411765	5.514706	41.176471
(3.65, 4.35]	77	189	0.283088	0.694853	28.308824	69.485294
(4.35, 5.05]	81	270	0.297794	0.992647	29.779412	99.264706
(5.05, 5.75]	2	272	0.007353	1.000000	0.735294	100.000000

#import pandas as pd
#tablafrec=pd.DataFrame({'fa':erup_grup_counts['frecuencia'],'faa':erup_grup_counts['cum_frequency'],'fr':erup_grup_counts['rel_frequency'],'fra':erup_grup_counts['relacum_frequency'],'fporcen':erup_grup_counts['frec_%'],'fpa':erup_grup_counts['frec_%acum']})
#media=erup_grup_counts['frecuencia'].sum()
#media

# Scatter plot of eruption duration (x) against waiting time (y)
disp = faithful.plot.scatter(x='eruptions', y='waiting')

# Arithmetic mean of the eruptions variable
faithful.eruptions.mean()

3.4877830882352936

# Geometric mean of the eruptions variable
stats.gmean(faithful.eruptions)

3.2713131325361786

# Harmonic mean of the eruptions variable
stats.hmean(faithful.eruptions)

3.0389330499472607

# Median of the eruptions variable
faithful.eruptions.median()

4.0

# Trimmed mean: the lowest 10% and the highest 10% of the observations
# are discarded before averaging
stats.trim_mean(faithful.eruptions, proportiontocut=.10)

3.5298073394495413

# Mode(s) of the eruptions variable; every most-frequent value is returned
faithful.eruptions.mode()

0    1.867
1    4.500
Name: eruptions, dtype: float64

# Variance of the eruptions variable (pandas default settings)
faithful.eruptions.var()

1.302728332849468

# Standard deviation of the eruptions variable (pandas default settings)
faithful.eruptions.std()

1.141371251105208

# Quartiles: 25th, 50th and 75th percentiles of the eruptions variable
cuartiles = [.25, .5, .75]
faithful.eruptions.quantile(cuartiles)

0.25    2.16275
0.50    4.00000
0.75    4.45425
Name: eruptions, dtype: float64

# Box plot of the eruption durations drawn with seaborn
duraciones = faithful['eruptions'].tolist()
cajas = sns.boxplot(duraciones)

# The same box plot drawn directly with matplotlib
plt.boxplot(faithful.eruptions)
plt.show()

# Covariance matrix of the two numeric variables.
# The columns are selected explicitly because `faithful` also carries the
# categorical `erup_grup` column by now; pandas >= 2.0 no longer drops
# non-numeric columns silently and would raise on it.
faithful[['eruptions', 'waiting']].cov()

	eruptions	waiting
eruptions	1.302728	13.977808
waiting	13.977808	184.823312

# Correlation matrix of the two numeric variables.
# The columns are selected explicitly because `faithful` also carries the
# categorical `erup_grup` column by now; pandas >= 2.0 no longer drops
# non-numeric columns silently and would raise on it.
faithful[['eruptions', 'waiting']].corr()

	eruptions	waiting
eruptions	1.000000	0.900811
waiting	0.900811	1.000000

# Statistical summary (count, mean, std, min, quartiles, max) of eruptions
faithful.eruptions.describe()

count    272.000000
mean       3.487783
std        1.141371
min        1.600000
25%        2.162750
50%        4.000000
75%        4.454250
max        5.100000
Name: eruptions, dtype: float64

# Pairwise relationship plots for the columns of `faithful`
par = sns.pairplot(data=faithful)

# Keep only the eruptions that lasted longer than 2.55 and preview them
erupcion_larga = faithful['eruptions'] > 2.55
datos = faithful.loc[erupcion_larga]
datos.head()

	eruptions	waiting	erup_grup
1	3.600	79	(2.95, 3.65]
3	3.333	74	(2.95, 3.65]
5	4.533	85	(4.35, 5.05]
6	2.883	55	(2.25, 2.95]
7	4.700	88	(4.35, 5.05]

# Scatter plot: waiting time (x) against eruption duration (y)
# ==============================================================================
fig, ax = plt.subplots(1, 1, figsize=(6,4))
ax.scatter(x=faithful.waiting, y=faithful.eruptions, alpha= 0.8)
# The axis labels name the columns actually plotted; the previous
# 'Altura' / 'Peso' labels did not describe this data.
ax.set_xlabel('waiting')
ax.set_ylabel('eruptions')

Text(0, 0.5, 'Peso')

import statsmodels.api as sm
# Distribution of each variable: histogram plus a rug of the raw observations
# ==============================================================================
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

for ax, columna in zip(axs, ['waiting', 'eruptions']):
    valores = faithful[columna]
    ax.hist(x=valores, bins=5, color="#3182bd", alpha=0.5)
    # The rug sits just below the x axis. np.full() is used rather than
    # np.full_like(valores, -0.01): full_like inherits the dtype of its
    # input, so for the integer `waiting` column -0.01 was truncated to 0.
    ax.plot(valores, np.full(len(valores), -0.01), '|k', markeredgewidth=1)
    ax.set_title(f'Distribución ({columna})')
    ax.set_xlabel(columna)
    ax.set_ylabel('counts')

Text(0, 0.5, 'counts')

# Q-Q plots of both variables
# ==============================================================================
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

for ax, columna in zip(axs, ['waiting', 'eruptions']):
    sm.qqplot(
        faithful[columna],
        fit   = True,
        line  = 'q',
        alpha = 0.4,
        lw    = 2,
        ax    = ax
    )
    # Each panel is titled with the column it shows; both panels were
    # previously labelled 'height'.
    ax.set_title(f'Gráfico Q-Q {columna}', fontsize = 10, fontweight = "bold")
    ax.tick_params(labelsize = 7)

import pingouin as pg
from scipy import stats
# Shapiro-Wilk normality test applied to each variable
# ==============================================================================
# The printed label is the column being tested; the previous messages called
# the variables 'height' and 'weight'.
for columna in ('waiting', 'eruptions'):
    shapiro_test = stats.shapiro(faithful[columna])
    print(f"Variable {columna}: {shapiro_test}")

Variable height: ShapiroResult(statistic=0.922146737575531, pvalue=1.015239073365315e-10)
Variable weight: ShapiroResult(statistic=0.8459155559539795, pvalue=9.03598972142245e-16)

# Correlation between waiting and eruptions computed with pandas,
# once per supported method
# ==============================================================================
etiquetas_metodos = [
    ('Correlación Pearson: ', 'pearson'),
    ('Correlación spearman: ', 'spearman'),
    ('Correlación kendall: ', 'kendall'),
]
for etiqueta, metodo in etiquetas_metodos:
    coeficiente = faithful['waiting'].corr(faithful['eruptions'], method=metodo)
    print(etiqueta, coeficiente)

Correlación Pearson:  0.9008111683218127
Correlación spearman:  0.7779720576516121
Correlación kendall:  0.5747673538950212

# Data: download the SaratogaHouses CSV
# ==============================================================================
url = (
    'https://raw.githubusercontent.com/JoaquinAmatRodrigo/'
    'Estadistica-machine-learning-python/master/data/SaratogaHouses.csv'
)
datos = pd.read_csv(url, sep=",")

# Replace the original headers, by position, with more descriptive names
nombres_columnas = [
    "precio", "metros_totales", "antiguedad", "precio_terreno",
    "metros_habitables", "universitarios", "dormitorios",
    "chimenea", "banyos", "habitaciones", "calefaccion",
    "consumo_calefacion", "desague", "vistas_lago",
    "nueva_construccion", "aire_acondicionado",
]
datos.columns = nombres_columnas

# Keep only the numeric columns
tipos_numericos = ['float64', 'int']
datos = datos.select_dtypes(include=tipos_numericos)

datos.head(20)

	precio	metros_totales	antiguedad	precio_terreno	metros_habitables	universitarios	dormitorios	chimenea	banyos	habitaciones
0	132500	0.09	42	50000	906	35	2	1	1.0	5
1	181115	0.92	0	22300	1953	51	3	0	2.5	6
2	109000	0.19	133	7300	1944	51	4	1	1.0	8
3	155000	0.41	13	18700	1944	51	3	1	1.5	5
4	86060	0.11	0	15000	840	51	2	0	1.0	3
5	120000	0.68	31	14000	1152	22	4	1	1.0	8
6	153000	0.40	33	23300	2752	51	4	1	1.5	8
7	170000	1.21	23	14600	1662	35	4	1	1.5	9
8	90000	0.83	36	22200	1632	51	3	0	1.5	8
9	122900	1.94	4	21200	1416	44	3	0	1.5	6
10	325000	2.29	123	12600	2894	51	7	0	1.0	12
11	120000	0.92	1	22300	1624	51	3	0	2.0	6
12	85860	8.97	13	4800	704	41	2	0	1.0	4
13	97000	0.11	153	3100	1383	57	3	0	2.0	5
14	127000	0.14	9	300	1300	41	3	0	1.5	8
15	89900	0.00	88	2500	936	57	3	0	1.0	4
16	155000	0.13	9	300	1300	41	3	0	1.5	7
17	253750	2.00	0	49800	2816	71	4	1	2.5	12
18	60000	0.21	82	8500	924	35	2	0	1.0	6
19	87500	0.88	17	19400	1092	35	3	0	1.0	6

# Pearson correlation matrix of the numeric columns in `datos`
# ==============================================================================
metodo_correlacion = 'pearson'
corr_matrix = datos.corr(method=metodo_correlacion)
corr_matrix

	precio	metros_totales	antiguedad	precio_terreno	metros_habitables	universitarios	dormitorios	chimenea	banyos	habitaciones
precio	1.000000	0.158333	-0.188793	0.581266	0.712390	0.200119	0.400349	0.376786	0.597250	0.531170
metros_totales	0.158333	1.000000	-0.016352	0.059222	0.163450	-0.033148	0.113982	0.085226	0.084823	0.137604
antiguedad	-0.188793	-0.016352	1.000000	-0.021818	-0.174242	-0.037785	0.027125	-0.172022	-0.361897	-0.082264
precio_terreno	0.581266	0.059222	-0.021818	1.000000	0.423441	0.228427	0.202449	0.211727	0.297498	0.298865
metros_habitables	0.712390	0.163450	-0.174242	0.423441	1.000000	0.209981	0.656196	0.473788	0.718564	0.733666
universitarios	0.200119	-0.033148	-0.037785	0.228427	0.209981	1.000000	0.162919	0.246626	0.179541	0.157068
dormitorios	0.400349	0.113982	0.027125	0.202449	0.656196	0.162919	1.000000	0.284475	0.458033	0.671863
chimenea	0.376786	0.085226	-0.172022	0.211727	0.473788	0.246626	0.284475	1.000000	0.436234	0.319894
banyos	0.597250	0.084823	-0.361897	0.297498	0.718564	0.179541	0.458033	0.436234	1.000000	0.517585
habitaciones	0.531170	0.137604	-0.082264	0.298865	0.733666	0.157068	0.671863	0.319894	0.517585	1.000000

# Heatmap of the correlation matrix
# ==============================================================================
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))

# Diverging palette centred on 0 so that the sign of a correlation is
# visible from its colour
paleta = sns.diverging_palette(20, 220, n=200)
opciones_heatmap = dict(
    annot=True,
    cbar=False,
    annot_kws={"size": 8},
    vmin=-1,
    vmax=1,
    center=0,
    cmap=paleta,
    square=True,
    ax=ax,
)
sns.heatmap(corr_matrix, **opciones_heatmap)

# Tilt the x tick labels so the column names do not overlap
etiquetas_x = ax.get_xticklabels()
ax.set_xticklabels(etiquetas_x, rotation=45, horizontalalignment='right')

ax.tick_params(labelsize=10)