CLASE 9. Regresión Logística Binaria Múltiple en Python

Autor/a

Gerson Rivera

Fecha de publicación

25 julio 2024

APLICACIÓN GML

Importando librerias

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score
from sklearn.metrics import confusion_matrix

NOTA. No olvide subir a su colab la base de datos utilizada “heart\_failure\_clinical\_records\_dataset.csv” o leerla directamente desde el software.

base="heart_failure_clinical_records_dataset.csv"
datos=pd.read_csv(base)
datos.head()

	age	anaemia	creatinine_phosphokinase	diabetes	ejection_fraction	high_blood_pressure	platelets	serum_creatinine	serum_sodium	sex	smoking	time	DEATH_EVENT
0	75.0	0	582	0	20	1	265000.00	1.9	130	1	0	4	1
1	55.0	0	7861	0	38	0	263358.03	1.1	136	1	0	6	1
2	65.0	0	146	0	20	0	162000.00	1.3	129	1	1	7	1
3	50.0	1	111	0	20	0	210000.00	1.9	137	1	0	7	1
4	65.0	1	160	1	20	0	327000.00	2.7	116	0	0	8	1

heart_failure=pd.read_csv('heart_failure_clinical_records_dataset.csv')
heart_failure.head(15)

	age	anaemia	creatinine_phosphokinase	diabetes	ejection_fraction	high_blood_pressure	platelets	serum_creatinine	serum_sodium	sex	smoking	time	DEATH_EVENT
0	75.0	0	582	0	20	1	265000.00	1.9	130	1	0	4	1
1	55.0	0	7861	0	38	0	263358.03	1.1	136	1	0	6	1
2	65.0	0	146	0	20	0	162000.00	1.3	129	1	1	7	1
3	50.0	1	111	0	20	0	210000.00	1.9	137	1	0	7	1
4	65.0	1	160	1	20	0	327000.00	2.7	116	0	0	8	1
5	90.0	1	47	0	40	1	204000.00	2.1	132	1	1	8	1
6	75.0	1	246	0	15	0	127000.00	1.2	137	1	0	10	1
7	60.0	1	315	1	60	0	454000.00	1.1	131	1	1	10	1
8	65.0	0	157	0	65	0	263358.03	1.5	138	0	0	10	1
9	80.0	1	123	0	35	1	388000.00	9.4	133	1	1	10	1
10	75.0	1	81	0	38	1	368000.00	4.0	131	1	1	10	1
11	62.0	0	231	0	25	1	253000.00	0.9	140	1	1	10	1
12	45.0	1	981	0	30	0	136000.00	1.1	137	1	0	11	1
13	50.0	1	168	0	38	1	276000.00	1.1	137	1	0	11	1
14	49.0	1	80	0	30	1	427000.00	1.0	138	0	0	12	0

heart_failure.dtypes

age                         float64
anaemia                       int64
creatinine_phosphokinase      int64
diabetes                      int64
ejection_fraction             int64
high_blood_pressure           int64
platelets                   float64
serum_creatinine            float64
serum_sodium                  int64
sex                           int64
smoking                       int64
time                          int64
DEATH_EVENT                   int64
dtype: object

heart_failure.shape

(299, 13)

heart_failure.columns

Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
       'DEATH_EVENT'],
      dtype='object')

cols=['age', 'creatinine_phosphokinase',
       'ejection_fraction', 'platelets',
       'serum_creatinine', 'serum_sodium' ,'time']
heart_failure[cols].corr()

	age	creatinine_phosphokinase	ejection_fraction	platelets	serum_creatinine	serum_sodium	time
age	1.000000	-0.081584	0.060098	-0.052354	0.159187	-0.045966	-0.224068
creatinine_phosphokinase	-0.081584	1.000000	-0.044080	0.024463	-0.016408	0.059550	-0.009346
ejection_fraction	0.060098	-0.044080	1.000000	0.072177	-0.011302	0.175902	0.041729
platelets	-0.052354	0.024463	0.072177	1.000000	-0.041198	0.062125	0.010514
serum_creatinine	0.159187	-0.016408	-0.011302	-0.041198	1.000000	-0.189095	-0.149315
serum_sodium	-0.045966	0.059550	0.175902	0.062125	-0.189095	1.000000	0.087640
time	-0.224068	-0.009346	0.041729	0.010514	-0.149315	0.087640	1.000000

from sklearn.model_selection import train_test_split
X=heart_failure.drop(columns='DEATH_EVENT')
y=heart_failure.DEATH_EVENT

X.head()

	age	anaemia	creatinine_phosphokinase	diabetes	ejection_fraction	high_blood_pressure	platelets	serum_creatinine	serum_sodium	sex	smoking	time
0	75.0	0	582	0	20	1	265000.00	1.9	130	1	0	4
1	55.0	0	7861	0	38	0	263358.03	1.1	136	1	0	6
2	65.0	0	146	0	20	0	162000.00	1.3	129	1	1	7
3	50.0	1	111	0	20	0	210000.00	1.9	137	1	0	7
4	65.0	1	160	1	20	0	327000.00	2.7	116	0	0	8

y.head()

0    1
1    1
2    1
3    1
4    1
Name: DEATH_EVENT, dtype: int64

X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.3)

model=LogisticRegression()
mod=model.fit(X_train,y_train)

C:\Users\Usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

mod.coef_

array([[ 6.35894825e-02,  3.87147883e-04,  3.10777030e-04,
         7.28439217e-04, -8.22673452e-02, -6.62636905e-04,
        -8.04069151e-07,  5.61481692e-03,  3.37743333e-03,
        -1.03768123e-03, -2.88449178e-04, -1.94393137e-02]])

mod.intercept_

array([0.00020332])

X.head(1)

	age	anaemia	creatinine_phosphokinase	diabetes	ejection_fraction	high_blood_pressure	platelets	serum_creatinine	serum_sodium	sex	smoking	time
0	75.0	0	582	0	20	1	265000.0	1.9	130	1	0	4

y.head(1)

0    1
Name: DEATH_EVENT, dtype: int64

mod.predict([[75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4]])

C:\Users\Usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names
  warnings.warn(

array([1])

predictions=model.predict(X_test)

accuracyTrain=model.score(X_train,y_train)
print(accuracyTrain)

0.84688995215311

accuracyTest=model.score(X_test,y_test)
print(accuracyTest)

0.7888888888888889

from sklearn.model_selection import cross_val_score
scores=cross_val_score(model,X_train,y_train,cv=5).mean()
scores

C:\Users\Usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
C:\Users\Usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
C:\Users\Usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
C:\Users\Usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
C:\Users\Usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

np.float64(0.8232288037166086)

cm=confusion_matrix(y_test,predictions) #Creando la matriz de confusión
cm

array([[58,  4],
       [15, 13]])

CREAR EL MODELO CON STATSMODELS

import statsmodels.api as sm

X_m=sm.add_constant(X_train)
X_m.head()

	const	age	anaemia	creatinine_phosphokinase	diabetes	ejection_fraction	high_blood_pressure	platelets	serum_creatinine	serum_sodium	sex	smoking	time
281	1.0	70.0	0	582	0	40	0	51000.0	2.7	136	1	1	250
44	1.0	60.0	1	588	1	60	0	194000.0	1.1	142	0	0	33
241	1.0	65.0	0	582	1	30	0	249000.0	1.3	136	1	1	212
189	1.0	40.0	0	244	0	45	1	275000.0	0.9	140	0	0	174
152	1.0	50.0	0	115	0	45	1	184000.0	0.9	134	1	1	118

logit=sm.Logit(y_train,X_m)
result=logit.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.351414
         Iterations 7

Logit Regression Results
Dep. Variable:	DEATH_EVENT	No. Observations:	209
Model:	Logit	Df Residuals:	196
Method:	MLE	Df Model:	12
Date:	Thu, 25 Jul 2024	Pseudo R-squ.:	0.4429
Time:	22:10:43	Log-Likelihood:	-73.445
converged:	True	LL-Null:	-131.85
Covariance Type:	nonrobust	LLR p-value:	2.679e-19

	coef	std err	z	P>\|z\|	[0.025	0.975]
const	11.4142	6.722	1.698	0.089	-1.760	24.588
age	0.0663	0.021	3.143	0.002	0.025	0.108
anaemia	0.0687	0.434	0.158	0.874	-0.781	0.918
creatinine_phosphokinase	0.0004	0.000	1.563	0.118	-0.000	0.001
diabetes	0.2499	0.436	0.573	0.567	-0.605	1.105
ejection_fraction	-0.0883	0.022	-3.980	0.000	-0.132	-0.045
high_blood_pressure	-0.3556	0.454	-0.783	0.433	-1.245	0.534
platelets	-1.213e-06	2.23e-06	-0.543	0.587	-5.59e-06	3.17e-06
serum_creatinine	0.4064	0.265	1.535	0.125	-0.113	0.925
serum_sodium	-0.0792	0.047	-1.702	0.089	-0.170	0.012
sex	-1.0197	0.508	-2.009	0.045	-2.015	-0.025
smoking	0.2383	0.529	0.450	0.653	-0.799	1.276
time	-0.0206	0.004	-5.554	0.000	-0.028	-0.013

cols=['anaemia','smoking']

np.exp(-0.0206)

np.float64(0.9796107305032505)

X_m=X_m.drop(columns=cols)

X_m.head()

	const	age	creatinine_phosphokinase	diabetes	ejection_fraction	high_blood_pressure	platelets	serum_creatinine	serum_sodium	sex	time
281	1.0	70.0	582	0	40	0	51000.0	2.7	136	1	250
44	1.0	60.0	588	1	60	0	194000.0	1.1	142	0	33
241	1.0	65.0	582	1	30	0	249000.0	1.3	136	1	212
189	1.0	40.0	244	0	45	1	275000.0	0.9	140	0	174
152	1.0	50.0	115	0	45	1	184000.0	0.9	134	1	118

logit2=sm.Logit(y_train,X_m)
result2=logit2.fit()
result2.summary()

Optimization terminated successfully.
         Current function value: 0.351924
         Iterations 7

Logit Regression Results
Dep. Variable:	DEATH_EVENT	No. Observations:	209
Model:	Logit	Df Residuals:	198
Method:	MLE	Df Model:	10
Date:	Thu, 25 Jul 2024	Pseudo R-squ.:	0.4421
Time:	22:10:43	Log-Likelihood:	-73.552
converged:	True	LL-Null:	-131.85
Covariance Type:	nonrobust	LLR p-value:	2.488e-20

	coef	std err	z	P>\|z\|	[0.025	0.975]
const	11.4122	6.723	1.697	0.090	-1.765	24.589
age	0.0659	0.021	3.130	0.002	0.025	0.107
creatinine_phosphokinase	0.0004	0.000	1.515	0.130	-0.000	0.001
diabetes	0.2374	0.436	0.544	0.586	-0.617	1.092
ejection_fraction	-0.0887	0.022	-4.005	0.000	-0.132	-0.045
high_blood_pressure	-0.3588	0.452	-0.793	0.428	-1.245	0.528
platelets	-1.156e-06	2.23e-06	-0.519	0.604	-5.52e-06	3.21e-06
serum_creatinine	0.3924	0.262	1.497	0.134	-0.121	0.906
serum_sodium	-0.0783	0.046	-1.686	0.092	-0.169	0.013
sex	-0.9332	0.457	-2.044	0.041	-1.828	-0.038
time	-0.0206	0.004	-5.615	0.000	-0.028	-0.013

cols=['platelets','diabetes']

X_m=X_m.drop(columns=cols)

X_m.head()

	const	age	creatinine_phosphokinase	ejection_fraction	high_blood_pressure	serum_creatinine	serum_sodium	sex	time
281	1.0	70.0	582	40	0	2.7	136	1	250
44	1.0	60.0	588	60	0	1.1	142	0	33
241	1.0	65.0	582	30	0	1.3	136	1	212
189	1.0	40.0	244	45	1	0.9	140	0	174
152	1.0	50.0	115	45	1	0.9	134	1	118

logit3=sm.Logit(y_train,X_m)
resul3=logit3.fit()
resul3.summary()

Optimization terminated successfully.
         Current function value: 0.353362
         Iterations 7

Logit Regression Results
Dep. Variable:	DEATH_EVENT	No. Observations:	209
Model:	Logit	Df Residuals:	200
Method:	MLE	Df Model:	8
Date:	Thu, 25 Jul 2024	Pseudo R-squ.:	0.4399
Time:	22:10:43	Log-Likelihood:	-73.853
converged:	True	LL-Null:	-131.85
Covariance Type:	nonrobust	LLR p-value:	2.231e-21

	coef	std err	z	P>\|z\|	[0.025	0.975]
const	12.1281	6.492	1.868	0.062	-0.595	24.852
age	0.0642	0.020	3.135	0.002	0.024	0.104
creatinine_phosphokinase	0.0004	0.000	1.523	0.128	-0.000	0.001
ejection_fraction	-0.0884	0.022	-4.022	0.000	-0.131	-0.045
high_blood_pressure	-0.3875	0.450	-0.862	0.389	-1.269	0.494
serum_creatinine	0.3859	0.260	1.485	0.137	-0.123	0.895
serum_sodium	-0.0842	0.045	-1.854	0.064	-0.173	0.005
sex	-0.9296	0.448	-2.074	0.038	-1.808	-0.051
time	-0.0206	0.004	-5.606	0.000	-0.028	-0.013

cols=['high_blood_pressure','creatinine_phosphokinase']
X_m=X_m.drop(columns=cols)
X_m.head()

	const	age	ejection_fraction	serum_creatinine	serum_sodium	sex	time
281	1.0	70.0	40	2.7	136	1	250
44	1.0	60.0	60	1.1	142	0	33
241	1.0	65.0	30	1.3	136	1	212
189	1.0	40.0	45	0.9	140	0	174
152	1.0	50.0	45	0.9	134	1	118

logit4=sm.Logit(y_train,X_m)
resul4=logit4.fit()
resul4.summary()

Optimization terminated successfully.
         Current function value: 0.363403
         Iterations 7

Logit Regression Results
Dep. Variable:	DEATH_EVENT	No. Observations:	209
Model:	Logit	Df Residuals:	202
Method:	MLE	Df Model:	6
Date:	Thu, 25 Jul 2024	Pseudo R-squ.:	0.4239
Time:	22:10:43	Log-Likelihood:	-75.951
converged:	True	LL-Null:	-131.85
Covariance Type:	nonrobust	LLR p-value:	8.597e-22

	coef	std err	z	P>\|z\|	[0.025	0.975]
const	11.5137	6.391	1.802	0.072	-1.012	24.039
age	0.0580	0.019	3.006	0.003	0.020	0.096
ejection_fraction	-0.0869	0.022	-4.023	0.000	-0.129	-0.045
serum_creatinine	0.3977	0.251	1.584	0.113	-0.094	0.890
serum_sodium	-0.0782	0.045	-1.739	0.082	-0.166	0.010
sex	-0.7295	0.430	-1.695	0.090	-1.573	0.114
time	-0.0198	0.003	-5.663	0.000	-0.027	-0.013

cols=['serum_creatinine','sex']
X_m=X_m.drop(columns=cols)
X_m.head()

	const	age	ejection_fraction	serum_sodium	time
281	1.0	70.0	40	136	250
44	1.0	60.0	60	142	33
241	1.0	65.0	30	136	212
189	1.0	40.0	45	140	174
152	1.0	50.0	45	134	118

logit5=sm.Logit(y_train,X_m)
resul5=logit5.fit()
resul5.summary()

Optimization terminated successfully.
         Current function value: 0.376528
         Iterations 7

Logit Regression Results
Dep. Variable:	DEATH_EVENT	No. Observations:	209
Model:	Logit	Df Residuals:	204
Method:	MLE	Df Model:	4
Date:	Thu, 25 Jul 2024	Pseudo R-squ.:	0.4031
Time:	22:10:43	Log-Likelihood:	-78.694
converged:	True	LL-Null:	-131.85
Covariance Type:	nonrobust	LLR p-value:	4.467e-22

	coef	std err	z	P>\|z\|	[0.025	0.975]
const	13.1838	6.067	2.173	0.030	1.293	25.074
age	0.0571	0.019	3.083	0.002	0.021	0.093
ejection_fraction	-0.0847	0.021	-4.029	0.000	-0.126	-0.043
serum_sodium	-0.0894	0.043	-2.075	0.038	-0.174	-0.005
time	-0.0203	0.003	-5.876	0.000	-0.027	-0.014

nd={
    'const':[1.0],
    'age':[50],
    'ejection_fraction':[40],
    'serum_sodium':[136],
    'time':[100]
}

nd=pd.DataFrame(nd,index=['Gerson'])

resul5.predict(nd)

Gerson    0.175449
dtype: float64

from sklearn.metrics import confusion_matrix,accuracy_score

Desempeño del modelo

colu=['age','ejection_fraction','serum_sodium','time']

X_test_m=sm.add_constant(X_test[colu])

pred_train=resul5.predict(X_m)
pred_train.head()

281    0.030590
44     0.136501
241    0.106965
189    0.012086
152    0.103600
dtype: float64

pred_train=pred_train.apply(lambda x:1 if x>0.5 else 0)
pred_train.head()

281    0
44     0
241    0
189    0
152    0
dtype: int64

pred_test=resul5.predict(X_test_m)
pred_test.head()

206    0.012945
188    0.082967
12     0.675642
219    0.040358
237    0.191790
dtype: float64

pred_test=pred_test.apply(lambda x:1 if x>0.5 else 0)
pred_test.head()

206    0
188    0
12     1
219    0
237    0
dtype: int64

from sklearn.metrics import accuracy_score
accuracyTrain=accuracy_score(y_train,pred_train)
accuracyTrain

0.8564593301435407

from sklearn.metrics import accuracy_score
accuracyTest=accuracy_score(y_test,pred_test)
accuracyTest

0.8222222222222222

Forma alternativa. Regresión Logística en Python

X_m.columns

Index(['const', 'age', 'ejection_fraction', 'serum_sodium', 'time'], dtype='object')

logit_glm=sm.GLM(y,X,family=sm.families.Binomial()).fit()
logit_glm.summary()

Generalized Linear Model Regression Results
Dep. Variable:	DEATH_EVENT	No. Observations:	299
Model:	GLM	Df Residuals:	287
Model Family:	Binomial	Df Model:	11
Link Function:	Logit	Scale:	1.0000
Method:	IRLS	Log-Likelihood:	-111.43
Date:	Thu, 25 Jul 2024	Deviance:	222.85
Time:	22:10:43	Pearson chi2:	274.
No. Iterations:	6	Pseudo R-squ. (CS):	0.3995
Covariance Type:	nonrobust

	coef	std err	z	P>\|z\|	[0.025	0.975]
age	0.0523	0.015	3.387	0.001	0.022	0.083
anaemia	-0.0374	0.357	-0.105	0.916	-0.737	0.662
creatinine_phosphokinase	0.0002	0.000	1.227	0.220	-0.000	0.001
diabetes	0.2460	0.345	0.714	0.476	-0.430	0.922
ejection_fraction	-0.0757	0.016	-4.724	0.000	-0.107	-0.044
high_blood_pressure	-0.0773	0.355	-0.218	0.828	-0.774	0.619
platelets	-1.109e-06	1.83e-06	-0.606	0.544	-4.69e-06	2.48e-06
serum_creatinine	0.7226	0.180	4.012	0.000	0.370	1.076
serum_sodium	0.0032	0.009	0.351	0.725	-0.015	0.021
sex	-0.4456	0.406	-1.099	0.272	-1.241	0.349
smoking	-0.0163	0.409	-0.040	0.968	-0.817	0.785
time	-0.0204	0.003	-6.947	0.000	-0.026	-0.015