# importando modulos necesarios
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import pandas as pd
import seaborn as sns
from pydataset import data
# Seaborn aesthetic parameters.
sns.set_palette("deep", desat=.6)
# Default figure size. seaborn's set_context() only applies rc keys that belong
# to its context definition, so "figure.figsize" passed there is ignored;
# set it on matplotlib's rcParams instead.
plt.rcParams["figure.figsize"] = (8, 4)

# Quantitative data in Python, part 1
# Load the 'faithful' dataset through the data() helper imported from pydataset.
faithful = data('faithful')
faithful.head(10)
| | eruptions | waiting |
|---|---|---|
| 1 | 3.600 | 79 |
| 2 | 1.800 | 54 |
| 3 | 3.333 | 74 |
| 4 | 2.283 | 62 |
| 5 | 4.533 | 85 |
| 6 | 2.883 | 55 |
| 7 | 4.700 | 88 |
| 8 | 3.600 | 85 |
| 9 | 1.950 | 51 |
| 10 | 4.350 | 85 |
# Histogram of the eruption durations, split into 8 bars.
duraciones = faithful['eruptions']
eje_hist = duraciones.hist(bins=8, color='red', figsize=(10, 6))
eje_hist.set_xlabel("Duración en minutos")
eje_hist.set_ylabel("Frecuencia")
plt.show()
# Frequency distribution.
# First build the edges of the class intervals (1.5, 2.0, ..., 5.5).
contenedores = np.arange(1.5, 6., 0.5)
# Then assign every eruption duration to its interval and sort the result.
frec = pd.cut(faithful['eruptions'], contenedores)
frec = frec.sort_values()
# Finally count the observations per interval to build the frequency table.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
tabla_frec = frec.value_counts()
tabla_frec
(4.0, 4.5] 75
(1.5, 2.0] 55
(4.5, 5.0] 54
(2.0, 2.5] 37
(3.5, 4.0] 34
(3.0, 3.5] 9
(2.5, 3.0] 5
(5.0, 5.5] 3
Name: eruptions, dtype: int64
# Observed range of the eruption durations (1.6 and 5.1 in the original run).
erup_min = faithful['eruptions'].min()
erup_max = faithful['eruptions'].max()
# Class edges: start slightly below the minimum and advance in steps of 0.7
# until the maximum is covered. Derived from the data instead of hard-coded.
bins = list(np.arange(erup_min - 0.05, erup_max + 1, 0.7))
faithful["erup_grup"] = pd.cut(faithful["eruptions"], bins=bins)
faithful['erup_grup']
# Absolute frequency per class. observed=False keeps empty classes in the
# table and avoids the FutureWarning newer pandas emits for categorical keys.
erup_grup_counts = (
    faithful.groupby("erup_grup", observed=False)
    .agg(frecuencia=("eruptions", "count"))
)
erup_grup_counts| frecuencia | |
|---|---|
| erup_grup | |
| (1.55, 2.25] | 79 |
| (2.25, 2.95] | 18 |
| (2.95, 3.65] | 15 |
| (3.65, 4.35] | 77 |
| (4.35, 5.05] | 81 |
| (5.05, 5.75] | 2 |
# Cumulative absolute frequency: running total of the class counts.
frec_acumulada = erup_grup_counts["frecuencia"].cumsum()
erup_grup_counts["cum_frequency"] = frec_acumulada
erup_grup_counts| frecuencia | cum_frequency | |
|---|---|---|
| erup_grup | ||
| (1.55, 2.25] | 79 | 79 |
| (2.25, 2.95] | 18 | 97 |
| (2.95, 3.65] | 15 | 112 |
| (3.65, 4.35] | 77 | 189 |
| (4.35, 5.05] | 81 | 270 |
| (5.05, 5.75] | 2 | 272 |
# Relative frequency: each class count as a fraction of the number of rows.
total_obs = len(faithful)
erup_grup_counts["rel_frequency"] = erup_grup_counts["frecuencia"] / total_obs
erup_grup_counts| frecuencia | cum_frequency | rel_frequency | |
|---|---|---|---|
| erup_grup | |||
| (1.55, 2.25] | 79 | 79 | 0.290441 |
| (2.25, 2.95] | 18 | 97 | 0.066176 |
| (2.95, 3.65] | 15 | 112 | 0.055147 |
| (3.65, 4.35] | 77 | 189 | 0.283088 |
| (4.35, 5.05] | 81 | 270 | 0.297794 |
| (5.05, 5.75] | 2 | 272 | 0.007353 |
# Cumulative relative frequency: running total of the relative frequencies.
frec_rel_acumulada = erup_grup_counts["rel_frequency"].cumsum()
erup_grup_counts["relacum_frequency"] = frec_rel_acumulada
erup_grup_counts| frecuencia | cum_frequency | rel_frequency | relacum_frequency | |
|---|---|---|---|---|
| erup_grup | ||||
| (1.55, 2.25] | 79 | 79 | 0.290441 | 0.290441 |
| (2.25, 2.95] | 18 | 97 | 0.066176 | 0.356618 |
| (2.95, 3.65] | 15 | 112 | 0.055147 | 0.411765 |
| (3.65, 4.35] | 77 | 189 | 0.283088 | 0.694853 |
| (4.35, 5.05] | 81 | 270 | 0.297794 | 0.992647 |
| (5.05, 5.75] | 2 | 272 | 0.007353 | 1.000000 |
# Percentage frequency: the relative frequency multiplied by 100.
porcentaje = erup_grup_counts["rel_frequency"] * 100
erup_grup_counts["frec_%"] = porcentaje
erup_grup_counts| frecuencia | cum_frequency | rel_frequency | relacum_frequency | frec_% | |
|---|---|---|---|---|---|
| erup_grup | |||||
| (1.55, 2.25] | 79 | 79 | 0.290441 | 0.290441 | 29.044118 |
| (2.25, 2.95] | 18 | 97 | 0.066176 | 0.356618 | 6.617647 |
| (2.95, 3.65] | 15 | 112 | 0.055147 | 0.411765 | 5.514706 |
| (3.65, 4.35] | 77 | 189 | 0.283088 | 0.694853 | 28.308824 |
| (4.35, 5.05] | 81 | 270 | 0.297794 | 0.992647 | 29.779412 |
| (5.05, 5.75] | 2 | 272 | 0.007353 | 1.000000 | 0.735294 |
# Cumulative percentage frequency: running total of the percentages.
porcentaje_acumulado = erup_grup_counts["frec_%"].cumsum()
erup_grup_counts["frec_%acum"] = porcentaje_acumulado
erup_grup_counts| frecuencia | cum_frequency | rel_frequency | relacum_frequency | frec_% | frec_%acum | |
|---|---|---|---|---|---|---|
| erup_grup | ||||||
| (1.55, 2.25] | 79 | 79 | 0.290441 | 0.290441 | 29.044118 | 29.044118 |
| (2.25, 2.95] | 18 | 97 | 0.066176 | 0.356618 | 6.617647 | 35.661765 |
| (2.95, 3.65] | 15 | 112 | 0.055147 | 0.411765 | 5.514706 | 41.176471 |
| (3.65, 4.35] | 77 | 189 | 0.283088 | 0.694853 | 28.308824 | 69.485294 |
| (4.35, 5.05] | 81 | 270 | 0.297794 | 0.992647 | 29.779412 | 99.264706 |
| (5.05, 5.75] | 2 | 272 | 0.007353 | 1.000000 | 0.735294 | 100.000000 |
#import pandas as pd
#tablafrec=pd.DataFrame({'fa':erup_grup_counts['frecuencia'],'faa':erup_grup_counts['cum_frequency'],'fr':erup_grup_counts['rel_frequency'],'fra':erup_grup_counts['relacum_frequency'],'fporcen':erup_grup_counts['frec_%'],'fpa':erup_grup_counts['frec_%acum']})
#media=erup_grup_counts['frecuencia'].sum()
#media
# diagrama de dispersión
# Scatter plot of eruption duration against waiting time.
disp = faithful.plot(kind='scatter', x='eruptions', y='waiting')

# Each statement below is followed by the value it displayed in the original
# notebook run, kept as a comment so the script remains valid Python.

# Arithmetic mean of the eruptions variable.
faithful['eruptions'].mean()  # 3.4877830882352936
# Geometric mean.
stats.gmean(faithful['eruptions'])  # 3.2713131325361786
# Harmonic mean.
stats.hmean(faithful['eruptions'])  # 3.0389330499472607
# Median.
faithful['eruptions'].median()  # 4.0
# Trimmed mean, cutting 10% off the upper and lower ends.
stats.trim_mean(faithful['eruptions'], .10)  # 3.5298073394495413
# Mode (two values were returned in the original run).
faithful['eruptions'].mode()
# 0    1.867
# 1    4.500
# Name: eruptions, dtype: float64
# Variance.
faithful['eruptions'].var()  # 1.302728332849468
# Standard deviation.
faithful['eruptions'].std()  # 1.141371251105208
# Quartiles.
faithful['eruptions'].quantile([.25, .5, .75])
# 0.25    2.16275
# 0.50    4.00000
# 0.75    4.45425
# Name: eruptions, dtype: float64
# Box plot of the eruption durations with seaborn. The data is passed as x=...
# explicitly because the meaning of the first positional argument differs
# between seaborn releases.
cajas = sns.boxplot(x=faithful['eruptions'])
plt.show()
# The same box plot drawn directly with matplotlib.
plt.boxplot(faithful['eruptions'])
plt.show()

# Restrict to the two numeric columns: by this point faithful also holds the
# categorical "erup_grup" column, which cov()/corr() reject on pandas 2.x
# (numeric_only now defaults to False there).
numericas = faithful[['eruptions', 'waiting']]
# Covariance matrix.
numericas.cov()
#            eruptions     waiting
# eruptions   1.302728   13.977808
# waiting    13.977808  184.823312
# Correlation coefficients.
numericas.corr()
#            eruptions   waiting
# eruptions   1.000000  0.900811
# waiting     0.900811  1.000000
# Statistical summary of the eruption durations (output of the original run
# kept as comments).
faithful['eruptions'].describe()
# count    272.000000
# mean       3.487783
# std        1.141371
# min        1.600000
# 25%        2.162750
# 50%        4.000000
# 75%        4.454250
# max        5.100000
# Name: eruptions, dtype: float64

# Pairwise plots of the dataset's variables.
par = sns.pairplot(faithful)

# Keep only the rows whose eruption duration exceeds 2.55.
datos = faithful[faithful['eruptions'] > 2.55]
datos.head()
#    eruptions  waiting     erup_grup
# 1      3.600       79  (2.95, 3.65]
# 3      3.333       74  (2.95, 3.65]
# 5      4.533       85  (4.35, 5.05]
# 6      2.883       55  (2.25, 2.95]
# 7      4.700       88  (4.35, 5.05]
# Scatter plot: waiting (x axis) against eruptions (y axis).
# ==============================================================================
fig, ax = plt.subplots(1, 1, figsize=(6, 4))
ax.scatter(x=faithful.waiting, y=faithful.eruptions, alpha=0.8)
# Label the axes with the variables actually plotted; the previous
# 'Altura'/'Peso' (height/weight) labels did not describe this dataset.
ax.set_xlabel('waiting')
ax.set_ylabel('eruptions')
import statsmodels.api as sm

# Distribution of each variable: histogram plus a rug of the observations.
# ==============================================================================
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
for eje, columna in zip(axs, ['waiting', 'eruptions']):
    valores = faithful[columna]
    eje.hist(x=valores, bins=5, color="#3182bd", alpha=0.5)
    # Rug marks at y = -0.01. np.full builds a float array; np.full_like
    # inherits the input dtype and would truncate -0.01 to 0 for an
    # integer column.
    eje.plot(valores, np.full(len(valores), -0.01), '|k', markeredgewidth=1)
    # Title and x label name the plotted column (one title used to carry a
    # stray "peso").
    eje.set_title(f'Distribución ({columna})')
    eje.set_xlabel(columna)
    eje.set_ylabel('counts')
# Q-Q plots, one panel per variable (statsmodels qqplot with fit=True).
# ==============================================================================
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
for eje, columna in zip(axs, ['waiting', 'eruptions']):
    sm.qqplot(
        faithful[columna],
        fit=True,
        line='q',
        alpha=0.4,
        lw=2,
        ax=eje,
    )
    # The title names the plotted variable (both panels used to read "height").
    eje.set_title(f'Gráfico Q-Q {columna}', fontsize=10, fontweight="bold")
    eje.tick_params(labelsize=7)

import pingouin as pg
from scipy import stats
# Shapiro-Wilk normality test on each variable.
# ==============================================================================
for columna in ['waiting', 'eruptions']:
    shapiro_test = stats.shapiro(faithful[columna])
    # Report under the variable's real name (the labels used to say
    # "height" and "weight").
    print(f"Variable {columna}: {shapiro_test}")
# Values obtained in the original run:
# Variable waiting: ShapiroResult(statistic=0.922146737575531, pvalue=1.015239073365315e-10)
# Variable eruptions: ShapiroResult(statistic=0.8459155559539795, pvalue=9.03598972142245e-16)
# Correlation between waiting and eruptions with pandas, one line per method.
# ==============================================================================
metodos = [('Pearson', 'pearson'), ('spearman', 'spearman'), ('kendall', 'kendall')]
for etiqueta, metodo in metodos:
    coeficiente = faithful['waiting'].corr(faithful['eruptions'], method=metodo)
    print(f'Correlación {etiqueta}: ', coeficiente)
# Values obtained in the original run:
# Correlación Pearson:  0.9008111683218127
# Correlación spearman:  0.7779720576516121
# Correlación kendall:  0.5747673538950212
# Data
# ==============================================================================
# CSV address, assembled from the repository root and the file path.
repositorio = 'https://raw.githubusercontent.com/JoaquinAmatRodrigo/'
ruta_csv = 'Estadistica-machine-learning-python/master/data/SaratogaHouses.csv'
url = repositorio + ruta_csv
datos = pd.read_csv(url, sep=",")
# Replace the column labels with more descriptive names.
nombres_columnas = [
    "precio",
    "metros_totales",
    "antiguedad",
    "precio_terreno",
    "metros_habitables",
    "universitarios",
    "dormitorios",
    "chimenea",
    "banyos",
    "habitaciones",
    "calefaccion",
    "consumo_calefacion",
    "desague",
    "vistas_lago",
    "nueva_construccion",
    "aire_acondicionado",
]
datos.columns = nombres_columnas
# Numeric variables
datos = datos.select_dtypes(include=['float64', 'int'])
datos.head(20)
| | precio | metros_totales | antiguedad | precio_terreno | metros_habitables | universitarios | dormitorios | chimenea | banyos | habitaciones |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 132500 | 0.09 | 42 | 50000 | 906 | 35 | 2 | 1 | 1.0 | 5 |
| 1 | 181115 | 0.92 | 0 | 22300 | 1953 | 51 | 3 | 0 | 2.5 | 6 |
| 2 | 109000 | 0.19 | 133 | 7300 | 1944 | 51 | 4 | 1 | 1.0 | 8 |
| 3 | 155000 | 0.41 | 13 | 18700 | 1944 | 51 | 3 | 1 | 1.5 | 5 |
| 4 | 86060 | 0.11 | 0 | 15000 | 840 | 51 | 2 | 0 | 1.0 | 3 |
| 5 | 120000 | 0.68 | 31 | 14000 | 1152 | 22 | 4 | 1 | 1.0 | 8 |
| 6 | 153000 | 0.40 | 33 | 23300 | 2752 | 51 | 4 | 1 | 1.5 | 8 |
| 7 | 170000 | 1.21 | 23 | 14600 | 1662 | 35 | 4 | 1 | 1.5 | 9 |
| 8 | 90000 | 0.83 | 36 | 22200 | 1632 | 51 | 3 | 0 | 1.5 | 8 |
| 9 | 122900 | 1.94 | 4 | 21200 | 1416 | 44 | 3 | 0 | 1.5 | 6 |
| 10 | 325000 | 2.29 | 123 | 12600 | 2894 | 51 | 7 | 0 | 1.0 | 12 |
| 11 | 120000 | 0.92 | 1 | 22300 | 1624 | 51 | 3 | 0 | 2.0 | 6 |
| 12 | 85860 | 8.97 | 13 | 4800 | 704 | 41 | 2 | 0 | 1.0 | 4 |
| 13 | 97000 | 0.11 | 153 | 3100 | 1383 | 57 | 3 | 0 | 2.0 | 5 |
| 14 | 127000 | 0.14 | 9 | 300 | 1300 | 41 | 3 | 0 | 1.5 | 8 |
| 15 | 89900 | 0.00 | 88 | 2500 | 936 | 57 | 3 | 0 | 1.0 | 4 |
| 16 | 155000 | 0.13 | 9 | 300 | 1300 | 41 | 3 | 0 | 1.5 | 7 |
| 17 | 253750 | 2.00 | 0 | 49800 | 2816 | 71 | 4 | 1 | 2.5 | 12 |
| 18 | 60000 | 0.21 | 82 | 8500 | 924 | 35 | 2 | 0 | 1.0 | 6 |
| 19 | 87500 | 0.88 | 17 | 19400 | 1092 | 35 | 3 | 0 | 1.0 | 6 |
# Correlation matrix
# ==============================================================================
# Pairwise Pearson correlation between the columns of datos.
metodo_correlacion = 'pearson'
corr_matrix = datos.corr(method=metodo_correlacion)
corr_matrix
| | precio | metros_totales | antiguedad | precio_terreno | metros_habitables | universitarios | dormitorios | chimenea | banyos | habitaciones |
|---|---|---|---|---|---|---|---|---|---|---|
| precio | 1.000000 | 0.158333 | -0.188793 | 0.581266 | 0.712390 | 0.200119 | 0.400349 | 0.376786 | 0.597250 | 0.531170 |
| metros_totales | 0.158333 | 1.000000 | -0.016352 | 0.059222 | 0.163450 | -0.033148 | 0.113982 | 0.085226 | 0.084823 | 0.137604 |
| antiguedad | -0.188793 | -0.016352 | 1.000000 | -0.021818 | -0.174242 | -0.037785 | 0.027125 | -0.172022 | -0.361897 | -0.082264 |
| precio_terreno | 0.581266 | 0.059222 | -0.021818 | 1.000000 | 0.423441 | 0.228427 | 0.202449 | 0.211727 | 0.297498 | 0.298865 |
| metros_habitables | 0.712390 | 0.163450 | -0.174242 | 0.423441 | 1.000000 | 0.209981 | 0.656196 | 0.473788 | 0.718564 | 0.733666 |
| universitarios | 0.200119 | -0.033148 | -0.037785 | 0.228427 | 0.209981 | 1.000000 | 0.162919 | 0.246626 | 0.179541 | 0.157068 |
| dormitorios | 0.400349 | 0.113982 | 0.027125 | 0.202449 | 0.656196 | 0.162919 | 1.000000 | 0.284475 | 0.458033 | 0.671863 |
| chimenea | 0.376786 | 0.085226 | -0.172022 | 0.211727 | 0.473788 | 0.246626 | 0.284475 | 1.000000 | 0.436234 | 0.319894 |
| banyos | 0.597250 | 0.084823 | -0.361897 | 0.297498 | 0.718564 | 0.179541 | 0.458033 | 0.436234 | 1.000000 | 0.517585 |
| habitaciones | 0.531170 | 0.137604 | -0.082264 | 0.298865 | 0.733666 | 0.157068 | 0.671863 | 0.319894 | 0.517585 | 1.000000 |
# Heatmap of the correlation matrix
# ==============================================================================
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
# Diverging palette, with the colour scale fixed to [-1, 1] and centred on 0.
paleta = sns.diverging_palette(20, 220, n=200)
opciones_heatmap = dict(
    annot=True,
    cbar=False,
    annot_kws={"size": 8},
    vmin=-1,
    vmax=1,
    center=0,
    cmap=paleta,
    square=True,
)
sns.heatmap(corr_matrix, ax=ax, **opciones_heatmap)
# Re-apply the x tick labels rotated 45 degrees and right-aligned.
etiquetas_x = ax.get_xticklabels()
ax.set_xticklabels(etiquetas_x, rotation=45, horizontalalignment='right')
ax.tick_params(labelsize=10)