# importando modulos necesarios
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import pandas as pd
import seaborn as sns
from pydataset import data
# Seaborn aesthetic parameters
sns.set_palette("deep", desat=.6)
# NOTE(review): set_context() may drop rc keys that are not plotting-context
# parameters, in which case "figure.figsize" has no effect here -- confirm.
sns.set_context(rc={"figure.figsize": (8, 4)})
datos cuantitativos en Python parte 1
# Load the "faithful" example dataset through pydataset and preview 10 rows
faithful = data('faithful')
faithful.head(10)
eruptions | waiting | |
---|---|---|
1 | 3.600 | 79 |
2 | 1.800 | 54 |
3 | 3.333 | 74 |
4 | 2.283 | 62 |
5 | 4.533 | 85 |
6 | 2.883 | 55 |
7 | 4.700 | 88 |
8 | 3.600 | 85 |
9 | 1.950 | 51 |
10 | 4.350 | 85 |
# Histogram of eruption duration using 8 bars
faithful['eruptions'].hist(bins=8, color='red', figsize=(10, 6))
plt.xlabel("Duración en minutos")
plt.ylabel("Frecuencia")
plt.show()
# Frequency distribution.
# First build the bin edges for the categories.
contenedores = np.arange(1.5, 6., 0.5)

# Then assign every observation to its bin
frec = pd.cut(faithful['eruptions'], contenedores)
frec = frec.sort_values()
frec
# Finally count the observations per bin to build the frequency table.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
tabla_frec = frec.value_counts()
tabla_frec
(4.0, 4.5] 75
(1.5, 2.0] 55
(4.5, 5.0] 54
(2.0, 2.5] 37
(3.5, 4.0] 34
(3.0, 3.5] 9
(2.5, 3.0] 5
(5.0, 5.5] 3
Name: eruptions, dtype: int64
faithful['eruptions'].min()
1.6
faithful['eruptions'].max()
5.1
# Bin edges: start 0.05 below the observed minimum (1.6) and advance in
# steps of 0.7 up to the observed maximum (5.1) plus 1.
bins = list(np.arange(1.6 - 0.05, 5.1 + 1, 0.7))

# Tag every eruption with the interval it falls into
faithful["erup_grup"] = pd.cut(faithful["eruptions"], bins=bins)
faithful['erup_grup']
# Absolute frequency per interval; observed=False makes explicit that
# intervals with no observations are kept in the table.
erup_grup_counts = (
    faithful.groupby("erup_grup", observed=False)
    .agg(frecuencia=("eruptions", "count"))
)
erup_grup_counts
frecuencia | |
---|---|
erup_grup | |
(1.55, 2.25] | 79 |
(2.25, 2.95] | 18 |
(2.95, 3.65] | 15 |
(3.65, 4.35] | 77 |
(4.35, 5.05] | 81 |
(5.05, 5.75] | 2 |
# Cumulative absolute frequency
erup_grup_counts["cum_frequency"] = erup_grup_counts["frecuencia"].cumsum()
erup_grup_counts
frecuencia | cum_frequency | |
---|---|---|
erup_grup | ||
(1.55, 2.25] | 79 | 79 |
(2.25, 2.95] | 18 | 97 |
(2.95, 3.65] | 15 | 112 |
(3.65, 4.35] | 77 | 189 |
(4.35, 5.05] | 81 | 270 |
(5.05, 5.75] | 2 | 272 |
# Relative frequency: share of all observations that fall in each interval
erup_grup_counts["rel_frequency"] = erup_grup_counts["frecuencia"] / len(faithful)
erup_grup_counts
frecuencia | cum_frequency | rel_frequency | |
---|---|---|---|
erup_grup | |||
(1.55, 2.25] | 79 | 79 | 0.290441 |
(2.25, 2.95] | 18 | 97 | 0.066176 |
(2.95, 3.65] | 15 | 112 | 0.055147 |
(3.65, 4.35] | 77 | 189 | 0.283088 |
(4.35, 5.05] | 81 | 270 | 0.297794 |
(5.05, 5.75] | 2 | 272 | 0.007353 |
# Cumulative relative frequency
erup_grup_counts["relacum_frequency"] = erup_grup_counts["rel_frequency"].cumsum()
erup_grup_counts
frecuencia | cum_frequency | rel_frequency | relacum_frequency | |
---|---|---|---|---|
erup_grup | ||||
(1.55, 2.25] | 79 | 79 | 0.290441 | 0.290441 |
(2.25, 2.95] | 18 | 97 | 0.066176 | 0.356618 |
(2.95, 3.65] | 15 | 112 | 0.055147 | 0.411765 |
(3.65, 4.35] | 77 | 189 | 0.283088 | 0.694853 |
(4.35, 5.05] | 81 | 270 | 0.297794 | 0.992647 |
(5.05, 5.75] | 2 | 272 | 0.007353 | 1.000000 |
# Relative frequency expressed as a percentage
erup_grup_counts["frec_%"] = erup_grup_counts["rel_frequency"] * 100
erup_grup_counts
frecuencia | cum_frequency | rel_frequency | relacum_frequency | frec_% | |
---|---|---|---|---|---|
erup_grup | |||||
(1.55, 2.25] | 79 | 79 | 0.290441 | 0.290441 | 29.044118 |
(2.25, 2.95] | 18 | 97 | 0.066176 | 0.356618 | 6.617647 |
(2.95, 3.65] | 15 | 112 | 0.055147 | 0.411765 | 5.514706 |
(3.65, 4.35] | 77 | 189 | 0.283088 | 0.694853 | 28.308824 |
(4.35, 5.05] | 81 | 270 | 0.297794 | 0.992647 | 29.779412 |
(5.05, 5.75] | 2 | 272 | 0.007353 | 1.000000 | 0.735294 |
# Cumulative percentage frequency
erup_grup_counts["frec_%acum"] = erup_grup_counts["frec_%"].cumsum()
erup_grup_counts
frecuencia | cum_frequency | rel_frequency | relacum_frequency | frec_% | frec_%acum | |
---|---|---|---|---|---|---|
erup_grup | ||||||
(1.55, 2.25] | 79 | 79 | 0.290441 | 0.290441 | 29.044118 | 29.044118 |
(2.25, 2.95] | 18 | 97 | 0.066176 | 0.356618 | 6.617647 | 35.661765 |
(2.95, 3.65] | 15 | 112 | 0.055147 | 0.411765 | 5.514706 | 41.176471 |
(3.65, 4.35] | 77 | 189 | 0.283088 | 0.694853 | 28.308824 | 69.485294 |
(4.35, 5.05] | 81 | 270 | 0.297794 | 0.992647 | 29.779412 | 99.264706 |
(5.05, 5.75] | 2 | 272 | 0.007353 | 1.000000 | 0.735294 | 100.000000 |
#import pandas as pd
#tablafrec=pd.DataFrame({'fa':erup_grup_counts['frecuencia'],'faa':erup_grup_counts['cum_frequency'],'fr':erup_grup_counts['rel_frequency'],'fra':erup_grup_counts['rel_frequency'],'fporcen':erup_grup_counts['frec_%']},'fpa':erup_grup_counts['frec_%acum']})
#media=erup_grup_counts['frecuencia'].sum()
#media
# Scatter plot of eruption duration against waiting time
disp = faithful.plot(kind='scatter', x='eruptions', y='waiting')
# Arithmetic mean of the eruptions variable
faithful['eruptions'].mean()
3.4877830882352936
# Geometric mean
stats.gmean(faithful['eruptions'])
3.2713131325361786
# Harmonic mean
stats.hmean(faithful['eruptions'])
3.0389330499472607
# Median
faithful['eruptions'].median()
4.0
# Trimmed mean, cutting 10% of the observations from each tail
stats.trim_mean(faithful['eruptions'], .10)
3.5298073394495413
# Mode (may return more than one value)
faithful['eruptions'].mode()
0 1.867
1 4.500
Name: eruptions, dtype: float64
# Variance
faithful['eruptions'].var()
1.302728332849468
# Standard deviation
faithful['eruptions'].std()
1.141371251105208
# Quartiles
faithful['eruptions'].quantile([.25, .5, .75])
0.25 2.16275
0.50 4.00000
0.75 4.45425
Name: eruptions, dtype: float64
# Box plots: one drawn with seaborn, one with matplotlib
cajas = sns.boxplot(list(faithful['eruptions']))
plt.boxplot(faithful['eruptions'])
plt.show()
# Covariance matrix.
# numeric_only=True skips the categorical "erup_grup" column added earlier;
# without it, recent pandas versions raise instead of silently dropping it.
faithful.cov(numeric_only=True)
eruptions | waiting | |
---|---|---|
eruptions | 1.302728 | 13.977808 |
waiting | 13.977808 | 184.823312 |
# Correlation coefficient matrix.
# numeric_only=True skips the categorical "erup_grup" column added earlier;
# without it, recent pandas versions raise instead of silently dropping it.
faithful.corr(numeric_only=True)
eruptions | waiting | |
---|---|---|
eruptions | 1.000000 | 0.900811 |
waiting | 0.900811 | 1.000000 |
# Statistical summary
faithful['eruptions'].describe()
count 272.000000
mean 3.487783
std 1.141371
min 1.600000
25% 2.162750
50% 4.000000
75% 4.454250
max 5.100000
Name: eruptions, dtype: float64
# Pairwise relationship plots for the whole frame
par = sns.pairplot(faithful)
# Keep only the eruptions longer than 2.55 and preview the result
datos = faithful[faithful['eruptions'] > 2.55]
datos.head()
eruptions | waiting | erup_grup | |
---|---|---|---|
1 | 3.600 | 79 | (2.95, 3.65] |
3 | 3.333 | 74 | (2.95, 3.65] |
5 | 4.533 | 85 | (4.35, 5.05] |
6 | 2.883 | 55 | (2.25, 2.95] |
7 | 4.700 | 88 | (4.35, 5.05] |
# Plot
# ==============================================================================
fig, ax = plt.subplots(1, 1, figsize=(6, 4))
ax.scatter(x=faithful.waiting, y=faithful.eruptions, alpha=0.8)
# NOTE(review): the axis labels read "Altura"/"Peso" (height/weight) while the
# plotted data is waiting/eruptions -- confirm whether the labels are intended.
ax.set_xlabel('Altura')
ax.set_ylabel('Peso')
Text(0, 0.5, 'Peso')
import statsmodels.api as sm
# Distribution plot of both variables
# ==============================================================================
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

axs[0].hist(x=faithful.waiting, bins=5, color="#3182bd", alpha=0.5)
# Rug marks just below the x axis. dtype=float is required: full_like would
# otherwise inherit the dtype of "waiting" and, if that is an integer dtype,
# truncate -0.01 to 0.
axs[0].plot(
    faithful.waiting,
    np.full_like(faithful.waiting, -0.01, dtype=float),
    '|k',
    markeredgewidth=1,
)
axs[0].set_title('Distribución (waiting)')
axs[0].set_xlabel('waiting')
axs[0].set_ylabel('counts')

axs[1].hist(x=faithful.eruptions, bins=5, color="#3182bd", alpha=0.5)
axs[1].plot(
    faithful.eruptions,
    np.full_like(faithful.eruptions, -0.01, dtype=float),
    '|k',
    markeredgewidth=1,
)
axs[1].set_title('Distribución peso (eruptions)')
axs[1].set_xlabel('eruptions')
axs[1].set_ylabel('counts')
Text(0, 0.5, 'counts')
# Q-Q plots
# ==============================================================================
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

sm.qqplot(
    faithful.waiting,
    fit   = True,
    line  = 'q',
    alpha = 0.4,
    lw    = 2,
    ax    = axs[0]
)
# NOTE(review): both panel titles say "height" although the panels show
# waiting and eruptions -- confirm the intended titles.
axs[0].set_title('Gráfico Q-Q height', fontsize = 10, fontweight = "bold")
axs[0].tick_params(labelsize = 7)

sm.qqplot(
    faithful.eruptions,
    fit   = True,
    line  = 'q',
    alpha = 0.4,
    lw    = 2,
    ax    = axs[1]
)
axs[1].set_title('Gráfico Q-Q height', fontsize = 10, fontweight = "bold")
axs[1].tick_params(labelsize = 7)
import pingouin as pg
from scipy import stats
# Shapiro-Wilk normality test on each variable
# ==============================================================================
# NOTE(review): the printed labels say "height"/"weight" while the tested
# columns are waiting/eruptions -- confirm the intended labels.
shapiro_test = stats.shapiro(faithful.waiting)
print(f"Variable height: {shapiro_test}")
shapiro_test = stats.shapiro(faithful.eruptions)
print(f"Variable weight: {shapiro_test}")
Variable height: ShapiroResult(statistic=0.922146737575531, pvalue=1.015239073365315e-10)
Variable weight: ShapiroResult(statistic=0.8459155559539795, pvalue=9.03598972142245e-16)
# Correlation between waiting and eruptions computed with pandas, once per
# supported method
# ==============================================================================
for etiqueta, metodo in (
    ('Pearson', 'pearson'),
    ('spearman', 'spearman'),
    ('kendall', 'kendall'),
):
    coeficiente = faithful['waiting'].corr(faithful['eruptions'], method=metodo)
    print(f'Correlación {etiqueta}: ', coeficiente)
Correlación Pearson: 0.9008111683218127
Correlación spearman: 0.7779720576516121
Correlación kendall: 0.5747673538950212
# Data
# ==============================================================================
url = ('https://raw.githubusercontent.com/JoaquinAmatRodrigo/' +
       'Estadistica-machine-learning-python/master/data/SaratogaHouses.csv')
datos = pd.read_csv(url, sep=",")

# Rename the columns so that they are more descriptive
datos.columns = ["precio", "metros_totales", "antiguedad", "precio_terreno",
                 "metros_habitables", "universitarios", "dormitorios",
                 "chimenea", "banyos", "habitaciones", "calefaccion",
                 "consumo_calefacion", "desague", "vistas_lago",
                 "nueva_construccion", "aire_acondicionado"]
# Keep the numeric variables only
datos = datos.select_dtypes(include=['float64', 'int'])
datos.head(20)
precio | metros_totales | antiguedad | precio_terreno | metros_habitables | universitarios | dormitorios | chimenea | banyos | habitaciones | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 132500 | 0.09 | 42 | 50000 | 906 | 35 | 2 | 1 | 1.0 | 5 |
1 | 181115 | 0.92 | 0 | 22300 | 1953 | 51 | 3 | 0 | 2.5 | 6 |
2 | 109000 | 0.19 | 133 | 7300 | 1944 | 51 | 4 | 1 | 1.0 | 8 |
3 | 155000 | 0.41 | 13 | 18700 | 1944 | 51 | 3 | 1 | 1.5 | 5 |
4 | 86060 | 0.11 | 0 | 15000 | 840 | 51 | 2 | 0 | 1.0 | 3 |
5 | 120000 | 0.68 | 31 | 14000 | 1152 | 22 | 4 | 1 | 1.0 | 8 |
6 | 153000 | 0.40 | 33 | 23300 | 2752 | 51 | 4 | 1 | 1.5 | 8 |
7 | 170000 | 1.21 | 23 | 14600 | 1662 | 35 | 4 | 1 | 1.5 | 9 |
8 | 90000 | 0.83 | 36 | 22200 | 1632 | 51 | 3 | 0 | 1.5 | 8 |
9 | 122900 | 1.94 | 4 | 21200 | 1416 | 44 | 3 | 0 | 1.5 | 6 |
10 | 325000 | 2.29 | 123 | 12600 | 2894 | 51 | 7 | 0 | 1.0 | 12 |
11 | 120000 | 0.92 | 1 | 22300 | 1624 | 51 | 3 | 0 | 2.0 | 6 |
12 | 85860 | 8.97 | 13 | 4800 | 704 | 41 | 2 | 0 | 1.0 | 4 |
13 | 97000 | 0.11 | 153 | 3100 | 1383 | 57 | 3 | 0 | 2.0 | 5 |
14 | 127000 | 0.14 | 9 | 300 | 1300 | 41 | 3 | 0 | 1.5 | 8 |
15 | 89900 | 0.00 | 88 | 2500 | 936 | 57 | 3 | 0 | 1.0 | 4 |
16 | 155000 | 0.13 | 9 | 300 | 1300 | 41 | 3 | 0 | 1.5 | 7 |
17 | 253750 | 2.00 | 0 | 49800 | 2816 | 71 | 4 | 1 | 2.5 | 12 |
18 | 60000 | 0.21 | 82 | 8500 | 924 | 35 | 2 | 0 | 1.0 | 6 |
19 | 87500 | 0.88 | 17 | 19400 | 1092 | 35 | 3 | 0 | 1.0 | 6 |
# Correlation matrix (Pearson)
# ==============================================================================
corr_matrix = datos.corr(method='pearson')
corr_matrix
precio | metros_totales | antiguedad | precio_terreno | metros_habitables | universitarios | dormitorios | chimenea | banyos | habitaciones | |
---|---|---|---|---|---|---|---|---|---|---|
precio | 1.000000 | 0.158333 | -0.188793 | 0.581266 | 0.712390 | 0.200119 | 0.400349 | 0.376786 | 0.597250 | 0.531170 |
metros_totales | 0.158333 | 1.000000 | -0.016352 | 0.059222 | 0.163450 | -0.033148 | 0.113982 | 0.085226 | 0.084823 | 0.137604 |
antiguedad | -0.188793 | -0.016352 | 1.000000 | -0.021818 | -0.174242 | -0.037785 | 0.027125 | -0.172022 | -0.361897 | -0.082264 |
precio_terreno | 0.581266 | 0.059222 | -0.021818 | 1.000000 | 0.423441 | 0.228427 | 0.202449 | 0.211727 | 0.297498 | 0.298865 |
metros_habitables | 0.712390 | 0.163450 | -0.174242 | 0.423441 | 1.000000 | 0.209981 | 0.656196 | 0.473788 | 0.718564 | 0.733666 |
universitarios | 0.200119 | -0.033148 | -0.037785 | 0.228427 | 0.209981 | 1.000000 | 0.162919 | 0.246626 | 0.179541 | 0.157068 |
dormitorios | 0.400349 | 0.113982 | 0.027125 | 0.202449 | 0.656196 | 0.162919 | 1.000000 | 0.284475 | 0.458033 | 0.671863 |
chimenea | 0.376786 | 0.085226 | -0.172022 | 0.211727 | 0.473788 | 0.246626 | 0.284475 | 1.000000 | 0.436234 | 0.319894 |
banyos | 0.597250 | 0.084823 | -0.361897 | 0.297498 | 0.718564 | 0.179541 | 0.458033 | 0.436234 | 1.000000 | 0.517585 |
habitaciones | 0.531170 | 0.137604 | -0.082264 | 0.298865 | 0.733666 | 0.157068 | 0.671863 | 0.319894 | 0.517585 | 1.000000 |
# Heatmap of the correlation matrix
# ==============================================================================
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))

sns.heatmap(
    corr_matrix,
    annot     = True,
    cbar      = False,
    annot_kws = {"size": 8},
    vmin      = -1,
    vmax      = 1,
    center    = 0,
    cmap      = sns.diverging_palette(20, 220, n=200),
    square    = True,
    ax        = ax
)

# Tilt the x tick labels so they stay readable
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation = 45,
    horizontalalignment = 'right',
)

ax.tick_params(labelsize = 10)