A base do código vem de https://www.statsmodels.org/dev/examples/notebooks/generated/ols.html.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.datasets.longley import load_pandas
y = load_pandas().endog # variavel endogena y
X = load_pandas().exog # variaveis exogenas X
X = sm.add_constant(X) # adiciona a constante em X
Resultado da regressão de Y = f(X)
#from statsmodels.formula.api import ols
ols_model = sm.OLS(y, X)
ols_results = ols_model.fit()
print(ols_results.summary())
## OLS Regression Results
## ==============================================================================
## Dep. Variable: TOTEMP R-squared: 0.995
## Model: OLS Adj. R-squared: 0.992
## Method: Least Squares F-statistic: 330.3
## Date: sex, 24 mar 2023 Prob (F-statistic): 4.98e-10
## Time: 19:33:11 Log-Likelihood: -109.62
## No. Observations: 16 AIC: 233.2
## Df Residuals: 9 BIC: 238.6
## Df Model: 6
## Covariance Type: nonrobust
## ==============================================================================
## coef std err t P>|t| [0.025 0.975]
## ------------------------------------------------------------------------------
## const -3.482e+06 8.9e+05 -3.911 0.004 -5.5e+06 -1.47e+06
## GNPDEFL 15.0619 84.915 0.177 0.863 -177.029 207.153
## GNP -0.0358 0.033 -1.070 0.313 -0.112 0.040
## UNEMP -2.0202 0.488 -4.136 0.003 -3.125 -0.915
## ARMED -1.0332 0.214 -4.822 0.001 -1.518 -0.549
## POP -0.0511 0.226 -0.226 0.826 -0.563 0.460
## YEAR 1829.1515 455.478 4.016 0.003 798.788 2859.515
## ==============================================================================
## Omnibus: 0.749 Durbin-Watson: 2.559
## Prob(Omnibus): 0.688 Jarque-Bera (JB): 0.684
## Skew: 0.420 Prob(JB): 0.710
## Kurtosis: 2.434 Cond. No. 4.86e+09
## ==============================================================================
##
## Notes:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
## [2] The condition number is large, 4.86e+09. This might indicate that there are
## strong multicollinearity or other numerical problems.
##
## C:\Users\amrof\AppData\Local\Programs\Python\Python310\lib\site-packages\scipy\stats\_stats_py.py:1477: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=16
## warnings.warn("kurtosistest only valid for n>=20 ... continuing "
print("Valores previstos: ", ols_results.predict())
## Valores previstos: [60055.65997023 61216.01394239 60124.71283224 61597.11462192
## 62911.28540923 63888.31121532 65153.04895639 63774.18035686
## 66004.69522739 67401.60590544 68186.26892711 66552.05504251
## 68810.54997358 69649.67130803 68989.06848603 70757.75782518]
Plot para comparar previsão por MQO com valores observados.
Intervalos de Confiança em torno das previsões foram construídas com o
comando wls_prediction_std
.
import statsmodels.api as sm
# Longley's Economic Regression Data
longley = sm.datasets.get_rdataset('longley')
# Extract the data
longley = longley.data
# Subset
df = longley['Employed']
import matplotlib.pyplot as plt
# Make figures A6 in size
A = 6
plt.rc('figure', figsize=[46.82 * .5**(.5 * A), 33.11 * .5**(.5 * A)])
# Image quality
plt.rc('figure', dpi=141)
# Be able to add Latex
plt.rc('text', usetex=True)
plt.rc('font', family='serif')
plt.rc('text.latex', preamble=r'\usepackage{textgreek}')
# Plot
import matplotlib.pyplot as plt
plt.scatter(df.index, df.values, c='k', s=10)
plt.axhline(0, c='k', lw=0.5)
plt.title('Longley Dataset')
plt.xlabel('Ano')
plt.ylabel(r'Mudança no emprego [%]')
## Regressão
import statsmodels.formula.api as smf
# Convert the data from a series to a data frame
df = longley
import numpy as np
# Sample size
n = df.shape[0]
# Add a column of constants
#df['ones'] = [1] * n
# Perform a regression
ols_model2 = smf.ols('Employed ~ GNP+Unemployed+ Year+ Population', data=df)
ols_results2 = ols_model2.fit()
print(ols_results2.summary())
## OLS Regression Results
## ==============================================================================
## Dep. Variable: Employed R-squared: 0.983
## Model: OLS Adj. R-squared: 0.976
## Method: Least Squares F-statistic: 154.4
## Date: sex, 24 mar 2023 Prob (F-statistic): 1.39e-09
## Time: 19:33:28 Log-Likelihood: -9.9192
## No. Observations: 16 AIC: 29.84
## Df Residuals: 11 BIC: 33.70
## Df Model: 4
## Covariance Type: nonrobust
## ==============================================================================
## coef std err t P>|t| [0.025 0.975]
## ------------------------------------------------------------------------------
## Intercept -1142.0831 1323.350 -0.863 0.407 -4054.757 1770.591
## GNP 0.0120 0.043 0.281 0.784 -0.082 0.106
## Unemployed -0.0085 0.006 -1.398 0.190 -0.022 0.005
## Year 0.6184 0.677 0.913 0.381 -0.872 2.109
## Population -0.0273 0.302 -0.090 0.930 -0.692 0.637
## ==============================================================================
## Omnibus: 0.142 Durbin-Watson: 1.058
## Prob(Omnibus): 0.931 Jarque-Bera (JB): 0.182
## Skew: 0.166 Prob(JB): 0.913
## Kurtosis: 2.596 Cond. No. 1.97e+07
## ==============================================================================
##
## Notes:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
## [2] The condition number is large, 1.97e+07. This might indicate that there are
## strong multicollinearity or other numerical problems.
##
## C:\Users\amrof\AppData\Local\Programs\Python\Python310\lib\site-packages\scipy\stats\_stats_py.py:1477: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=16
## warnings.warn("kurtosistest only valid for n>=20 ... continuing "