A base do código vem de https://www.statsmodels.org/dev/examples/notebooks/generated/ols.html.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.datasets.longley import load_pandas

y = load_pandas().endog # variavel endogena y
X = load_pandas().exog  # variaveis exogenas X
X = sm.add_constant(X)  # adiciona a constante em X

Resultado da regressão de Y = f(X)

#from statsmodels.formula.api import ols
ols_model = sm.OLS(y, X)
ols_results = ols_model.fit()
print(ols_results.summary())

##                             OLS Regression Results                            
## ==============================================================================
## Dep. Variable:                 TOTEMP   R-squared:                       0.995
## Model:                            OLS   Adj. R-squared:                  0.992
## Method:                 Least Squares   F-statistic:                     330.3
## Date:                sex, 24 mar 2023   Prob (F-statistic):           4.98e-10
## Time:                        19:33:11   Log-Likelihood:                -109.62
## No. Observations:                  16   AIC:                             233.2
## Df Residuals:                       9   BIC:                             238.6
## Df Model:                           6                                         
## Covariance Type:            nonrobust                                         
## ==============================================================================
##                  coef    std err          t      P>|t|      [0.025      0.975]
## ------------------------------------------------------------------------------
## const      -3.482e+06    8.9e+05     -3.911      0.004    -5.5e+06   -1.47e+06
## GNPDEFL       15.0619     84.915      0.177      0.863    -177.029     207.153
## GNP           -0.0358      0.033     -1.070      0.313      -0.112       0.040
## UNEMP         -2.0202      0.488     -4.136      0.003      -3.125      -0.915
## ARMED         -1.0332      0.214     -4.822      0.001      -1.518      -0.549
## POP           -0.0511      0.226     -0.226      0.826      -0.563       0.460
## YEAR        1829.1515    455.478      4.016      0.003     798.788    2859.515
## ==============================================================================
## Omnibus:                        0.749   Durbin-Watson:                   2.559
## Prob(Omnibus):                  0.688   Jarque-Bera (JB):                0.684
## Skew:                           0.420   Prob(JB):                        0.710
## Kurtosis:                       2.434   Cond. No.                     4.86e+09
## ==============================================================================
## 
## Notes:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
## [2] The condition number is large, 4.86e+09. This might indicate that there are
## strong multicollinearity or other numerical problems.
## 
## C:\Users\amrof\AppData\Local\Programs\Python\Python310\lib\site-packages\scipy\stats\_stats_py.py:1477: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=16
##   warnings.warn("kurtosistest only valid for n>=20 ... continuing "

print("Valores previstos: ", ols_results.predict())

## Valores previstos:  [60055.65997023 61216.01394239 60124.71283224 61597.11462192
##  62911.28540923 63888.31121532 65153.04895639 63774.18035686
##  66004.69522739 67401.60590544 68186.26892711 66552.05504251
##  68810.54997358 69649.67130803 68989.06848603 70757.75782518]

Plot para comparar previsão por MQO com valores observados. Intervalos de Confiança em torno das previsões foram construídas com o comando wls_prediction_std.

opcao 2

import statsmodels.api as sm

# Longley's Economic Regression Data
longley = sm.datasets.get_rdataset('longley')
# Extract the data
longley = longley.data
# Subset
df = longley['Employed']

Explorando o dataset

import matplotlib.pyplot as plt

# Make figures A6 in size
A = 6
plt.rc('figure', figsize=[46.82 * .5**(.5 * A), 33.11 * .5**(.5 * A)])
# Image quality
plt.rc('figure', dpi=141)
# Be able to add Latex
plt.rc('text', usetex=True)
plt.rc('font', family='serif')
plt.rc('text.latex', preamble=r'\usepackage{textgreek}')

# Plot
import matplotlib.pyplot as plt
plt.scatter(df.index, df.values, c='k', s=10)
plt.axhline(0, c='k', lw=0.5)
plt.title('Longley Dataset')
plt.xlabel('Ano')
plt.ylabel(r'Mudança no emprego [%]')

## Regressão

import statsmodels.formula.api as smf

# Convert the data from a series to a data frame
df = longley

import numpy as np

# Sample size
n = df.shape[0]

# Add a column of constants
#df['ones'] = [1] * n

# Perform a regression
ols_model2 = smf.ols('Employed ~ GNP+Unemployed+ Year+ Population', data=df)
ols_results2 = ols_model2.fit()
print(ols_results2.summary())

##                             OLS Regression Results                            
## ==============================================================================
## Dep. Variable:               Employed   R-squared:                       0.983
## Model:                            OLS   Adj. R-squared:                  0.976
## Method:                 Least Squares   F-statistic:                     154.4
## Date:                sex, 24 mar 2023   Prob (F-statistic):           1.39e-09
## Time:                        19:33:28   Log-Likelihood:                -9.9192
## No. Observations:                  16   AIC:                             29.84
## Df Residuals:                      11   BIC:                             33.70
## Df Model:                           4                                         
## Covariance Type:            nonrobust                                         
## ==============================================================================
##                  coef    std err          t      P>|t|      [0.025      0.975]
## ------------------------------------------------------------------------------
## Intercept  -1142.0831   1323.350     -0.863      0.407   -4054.757    1770.591
## GNP            0.0120      0.043      0.281      0.784      -0.082       0.106
## Unemployed    -0.0085      0.006     -1.398      0.190      -0.022       0.005
## Year           0.6184      0.677      0.913      0.381      -0.872       2.109
## Population    -0.0273      0.302     -0.090      0.930      -0.692       0.637
## ==============================================================================
## Omnibus:                        0.142   Durbin-Watson:                   1.058
## Prob(Omnibus):                  0.931   Jarque-Bera (JB):                0.182
## Skew:                           0.166   Prob(JB):                        0.913
## Kurtosis:                       2.596   Cond. No.                     1.97e+07
## ==============================================================================
## 
## Notes:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
## [2] The condition number is large, 1.97e+07. This might indicate that there are
## strong multicollinearity or other numerical problems.
## 
## C:\Users\amrof\AppData\Local\Programs\Python\Python310\lib\site-packages\scipy\stats\_stats_py.py:1477: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=16
##   warnings.warn("kurtosistest only valid for n>=20 ... continuing "

Longley data com Python

Adriano Marcos R Figueiredo

2023-03-24

opcao 2

Explorando o dataset