Abstract
This is an undergrad student level exercise for class use. This work is licensed under the Creative Commons Attribution-ShareAlike 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
License: CC BY-SA 4.0
Sugestão de citação: FIGUEIREDO, Adriano Marcos Rodrigues. Econometria: estatísticas descritivas em R e Python. Campo Grande-MS, Brasil: RStudio/Rpubs, 2022. Disponível em http://rpubs.com/amrofi/estat_descritivas_R_Python.
Os primeiros passos são criar ou abrir um diretório de trabalho. Se
optar por criar um novo projeto, haverá a possibilidade de criar em uma
pasta vazia. Em seguida, sugere-se que coloque os dados nesta pasta, se
possível em um arquivo MS Excel e chame a planilha de ‘dados’. Neste
caso, a planilha chama-se
gujarati 5ed p236 frangos tabela7_9.xlsx
.
7.19. Demanda por frangos nos Estados Unidos, 1960-1982. Para estudar o consumo per capita de frango nos Estados Unidos, use os dados da Tabela 7.9, em que: Y = consumo per capita de frango, em libras (peso); X2 = renda real disponível per capita, em $; X3 = preço real do frango no varejo, em centavos de dólar (¢) por libra (peso); X4 = preço real da carne suína no varejo, em centavos de dólar (¢) por libra (peso); X5 = preço real da carne bovina no varejo, em centavos de dólar (¢) por libra (peso); X6 = preço real dos substitutos da carne de frango, em centavos de dólar (¢) por libra (peso), que é uma média ponderada dos preços reais das carnes suína e bovina, usando como pesos o consumo relativo de cada uma dessas carnes em relação ao consumo total delas.
A opção do dput() pode ser obtida abaixo.
# Chicken-demand data set (Gujarati & Porter, Table 7.9; United States,
# 1960-1982), reproduced from dput() so the analysis runs without the Excel
# file. Columns: YEAR, Y (per-capita chicken consumption), X2 (real disposable
# income per capita), X3-X6 (real retail prices of chicken, pork, beef and the
# composite chicken substitute).
#
# The data frame is deliberately NOT attach()ed: every later step reads columns
# explicitly (e.g. dados_tbl$Y), and attaching would only place copies of the
# columns on the search path, where they can mask or be masked by other objects.
dados <- structure(
  list(
    YEAR = c(
      1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971,
      1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982
    ),
    Y = c(
      27.8, 29.9, 29.8, 30.8, 31.2, 33.3, 35.6, 36.4, 36.7, 38.4, 40.4, 40.3,
      41.8, 40.4, 40.7, 40.1, 42.7, 44.1, 46.7, 50.6, 50.1, 51.7, 52.9
    ),
    X2 = c(
      397.5, 413.3, 439.2, 459.7, 492.9, 528.6, 560.3, 624.6, 666.4, 717.8,
      768.2, 843.3, 911.6, 931.1, 1021.5, 1165.9, 1349.6, 1449.4, 1575.5,
      1759.1, 1994.2, 2258.1, 2478.7
    ),
    X3 = c(
      42.2, 38.1, 40.3, 39.5, 37.3, 38.1, 39.3, 37.8, 38.4, 40.1, 38.6, 39.8,
      39.7, 52.1, 48.9, 58.3, 57.9, 56.5, 63.7, 61.6, 58.9, 66.4, 70.4
    ),
    X4 = c(
      50.7, 52, 54, 55.3, 54.7, 63.7, 69.8, 65.9, 64.5, 70, 73.2, 67.8, 79.1,
      95.4, 94.2, 123.5, 129.9, 117.6, 130.9, 129.8, 128, 141, 168.2
    ),
    X5 = c(
      78.3, 79.2, 79.2, 79.2, 77.4, 80.2, 80.4, 83.9, 85.5, 93.7, 106.1, 104.8,
      114, 124.1, 127.6, 142.9, 143.6, 139.2, 165.5, 203.3, 219.6, 221.6, 232.6
    ),
    X6 = c(
      65.8, 66.9, 67.8, 69.6, 68.7, 73.6, 76.3, 77.2, 78.1, 84.7, 93.3, 89.7,
      100.7, 113.5, 115.3, 136.7, 139.2, 132, 132.1, 154.4, 174.9, 180.8, 189.4
    )
  ),
  row.names = c(NA, -23L),
  class = c("tbl_df", "tbl", "data.frame")
)
library(tidyverse)
# Convert the data frame to a tibble
dados_tbl <- as_tibble(dados)
# Inspect the structure of the data
str(dados_tbl)
tibble [23 x 7] (S3: tbl_df/tbl/data.frame)
$ YEAR: num [1:23] 1960 1961 1962 1963 1964 ...
$ Y : num [1:23] 27.8 29.9 29.8 30.8 31.2 33.3 35.6 36.4 36.7 38.4 ...
$ X2 : num [1:23] 398 413 439 460 493 ...
$ X3 : num [1:23] 42.2 38.1 40.3 39.5 37.3 38.1 39.3 37.8 38.4 40.1 ...
$ X4 : num [1:23] 50.7 52 54 55.3 54.7 63.7 69.8 65.9 64.5 70 ...
$ X5 : num [1:23] 78.3 79.2 79.2 79.2 77.4 80.2 80.4 83.9 85.5 93.7 ...
$ X6 : num [1:23] 65.8 66.9 67.8 69.6 68.7 73.6 76.3 77.2 78.1 84.7 ...
# Mean of Y
minhavar <- dados_tbl[["Y"]]
mean(minhavar)
[1] 39.66957
# Mediana
median(minhavar)
[1] 40.3
# Mode: frequency table of Y with the most frequent value listed first
sort(table(minhavar), decreasing = TRUE)
minhavar
40.4 27.8 29.8 29.9 30.8 31.2 33.3 35.6 36.4 36.7 38.4 40.1 40.3 40.7 41.8 42.7
2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
44.1 46.7 50.1 50.6 51.7 52.9
1 1 1 1 1 1
# Variância
var(minhavar)
[1] 54.3604
# Desvio padrão
sd(minhavar)
[1] 7.37295
# Covariance between X3 and X5
minhavar2 <- dados_tbl[["X3"]]
minhavar3 <- dados_tbl[["X5"]]
cov(minhavar2, minhavar3)
[1] 531.5792
cov(dados_tbl)
YEAR Y X2 X3 X4 X5
YEAR 46.00000 48.91818 3935.518 66.55455 226.2727 322.5182
Y 48.91818 54.36040 4314.699 68.84850 236.9505 355.1592
X2 3935.51818 4314.69935 381734.940 6399.46575 20829.9032 31369.5975
X3 66.55455 68.84850 6399.466 123.59225 379.8850 531.5792
X4 226.27273 236.95045 20829.903 379.88500 1240.7082 1706.1977
X5 322.51818 355.15915 31369.597 531.57923 1706.1977 2652.2231
X6 257.46364 275.17907 24175.211 418.07480 1364.5050 2016.3027
X6
YEAR 257.4636
Y 275.1791
X2 24175.2107
X3 418.0748
X4 1364.5050
X5 2016.3027
X6 1585.2080
# Correlação
cor(minhavar2, minhavar3)
[1] 0.9284689
cor(dados_tbl)
YEAR Y X2 X3 X4 X5 X6
YEAR 1.0000000 0.9782505 0.9391653 0.8826798 0.9471494 0.9233583 0.9534411
Y 0.9782505 1.0000000 0.9471708 0.8399579 0.9123919 0.9353554 0.9374130
X2 0.9391653 0.9471708 1.0000000 0.9316808 0.9571312 0.9858775 0.9827571
X3 0.8826798 0.8399579 0.9316808 1.0000000 0.9701116 0.9284689 0.9445289
X4 0.9471494 0.9123919 0.9571312 0.9701116 1.0000000 0.9405665 0.9729649
X5 0.9233583 0.9353554 0.9858775 0.9284689 0.9405665 1.0000000 0.9833488
X6 0.9534411 0.9374130 0.9827571 0.9445289 0.9729649 0.9833488 1.0000000
# 25th percentile of Y (first quartile)
quantile(minhavar, probs = 0.25)
25%
34.45
# 75th percentile of Y (third quartile)
quantile(minhavar, probs = 0.75)
75%
43.4
summary(dados_tbl)
YEAR Y X2 X3
Min. :1960 Min. :27.80 Min. : 397.5 Min. :37.30
1st Qu.:1966 1st Qu.:34.45 1st Qu.: 544.5 1st Qu.:38.95
Median :1971 Median :40.30 Median : 843.3 Median :40.30
Mean :1971 Mean :39.67 Mean :1035.1 Mean :48.00
3rd Qu.:1976 3rd Qu.:43.40 3rd Qu.:1399.5 3rd Qu.:58.10
Max. :1982 Max. :52.90 Max. :2478.7 Max. :70.40
X4 X5 X6
Min. : 50.7 Min. : 77.4 Min. : 65.80
1st Qu.: 64.1 1st Qu.: 80.3 1st Qu.: 74.95
Median : 73.2 Median :106.1 Median : 93.30
Mean : 90.4 Mean :124.4 Mean :107.86
3rd Qu.:125.8 3rd Qu.:143.2 3rd Qu.:134.40
Max. :168.2 Max. :232.6 Max. :189.40
skimr
library(skimr)
# Sumariza os dados com o pacote skimr
skim(dados_tbl)
Name | dados_tbl |
Number of rows | 23 |
Number of columns | 7 |
_______________________ | |
Column type frequency: | |
numeric | 7 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
YEAR | 0 | 1 | 1971.00 | 6.78 | 1960.0 | 1965.50 | 1971.0 | 1976.50 | 1982.0 | ▇▆▇▆▇ |
Y | 0 | 1 | 39.67 | 7.37 | 27.8 | 34.45 | 40.3 | 43.40 | 52.9 | ▅▃▇▂▃ |
X2 | 0 | 1 | 1035.07 | 617.85 | 397.5 | 544.45 | 843.3 | 1399.50 | 2478.7 | ▇▃▂▂▂ |
X3 | 0 | 1 | 48.00 | 11.12 | 37.3 | 38.95 | 40.3 | 58.10 | 70.4 | ▇▁▁▃▁ |
X4 | 0 | 1 | 90.40 | 35.22 | 50.7 | 64.10 | 73.2 | 125.75 | 168.2 | ▇▂▁▃▁ |
X5 | 0 | 1 | 124.43 | 51.50 | 77.4 | 80.30 | 106.1 | 143.25 | 232.6 | ▇▂▂▁▂ |
X6 | 0 | 1 | 107.86 | 39.81 | 65.8 | 74.95 | 93.3 | 134.40 | 189.4 | ▇▂▃▁▂ |
import numpy as np
import pandas as pd
# Load the data from the Excel workbook (sheet "py").
# openpyxl (for .xlsx files) and xlrd (for .xls files) had to be installed first.
df = pd.read_excel(
    "gujarati 5ed p236 frangos tabela7_9.xlsx",
    sheet_name="py",
)
# Optional round trip through a plain dict (left disabled):
# data_as_dict = df.to_dict()
# df = pd.DataFrame.from_dict(data_as_dict)
import pandas as pd
dados = pd.DataFrame.from_dict({'YEAR': {0: 1960, 1: 1961, 2: 1962, 3: 1963, 4: 1964, 5: 1965, 6: 1966, 7: 1967, 8: 1968, 9: 1969, 10: 1970, 11: 1971, 12: 1972, 13: 1973, 14: 1974, 15: 1975, 16: 1976, 17: 1977, 18: 1978, 19: 1979, 20: 1980, 21: 1981, 22: 1982}, 'Y': {0: 27.8, 1: 29.9, 2: 29.8, 3: 30.8, 4: 31.2, 5: 33.3, 6: 35.6, 7: 36.4, 8: 36.7, 9: 38.4, 10: 40.4, 11: 40.3, 12: 41.8, 13: 40.4, 14: 40.7, 15: 40.1, 16: 42.7, 17: 44.1, 18: 46.7, 19: 50.6, 20: 50.1, 21: 51.7, 22: 52.9}, 'X2': {0: 397.5, 1: 413.3, 2: 439.2, 3: 459.7, 4: 492.9, 5: 528.6, 6: 560.3, 7: 624.6, 8: 666.4, 9: 717.8, 10: 768.2, 11: 843.3, 12: 911.6, 13: 931.1, 14: 1021.5, 15: 1165.9, 16: 1349.6, 17: 1449.4, 18: 1575.5, 19: 1759.1, 20: 1994.2, 21: 2258.1, 22: 2478.7}, 'X3': {0: 42.2, 1: 38.1, 2: 40.3, 3: 39.5, 4: 37.3, 5: 38.1, 6: 39.3, 7: 37.8, 8: 38.4, 9: 40.1, 10: 38.6, 11: 39.8, 12: 39.7, 13: 52.1, 14: 48.9, 15: 58.3, 16: 57.9, 17: 56.5, 18: 63.7, 19: 61.6, 20: 58.9, 21: 66.4, 22: 70.4}, 'X4': {0: 50.7, 1: 52.0, 2: 54.0, 3: 55.3, 4: 54.7, 5: 63.7, 6: 69.8, 7: 65.9, 8: 64.5, 9: 70.0, 10: 73.2, 11: 67.8, 12: 79.1, 13: 95.4, 14: 94.2, 15: 123.5, 16: 129.9, 17: 117.6, 18: 130.9, 19: 129.8, 20: 128.0, 21: 141.0, 22: 168.2}, 'X5': {0: 78.3, 1: 79.2, 2: 79.2, 3: 79.2, 4: 77.4, 5: 80.2, 6: 80.4, 7: 83.9, 8: 85.5, 9: 93.7, 10: 106.1, 11: 104.8, 12: 114.0, 13: 124.1, 14: 127.6, 15: 142.9, 16: 143.6, 17: 139.2, 18: 165.5, 19: 203.3, 20: 219.6, 21: 221.6, 22: 232.6}, 'X6': {0: 65.8, 1: 66.9, 2: 67.8, 3: 69.6, 4: 68.7, 5: 73.6, 6: 76.3, 7: 77.2, 8: 78.1, 9: 84.7, 10: 93.3, 11: 89.7, 12: 100.7, 13: 113.5, 14: 115.3, 15: 136.7, 16: 139.2, 17: 132.0, 18: 132.1, 19: 154.4, 20: 174.9, 21: 180.8, 22: 189.4}})
# Display the full data frame
dados
YEAR Y X2 X3 X4 X5 X6
0 1960 27.8 397.5 42.2 50.7 78.3 65.8
1 1961 29.9 413.3 38.1 52.0 79.2 66.9
2 1962 29.8 439.2 40.3 54.0 79.2 67.8
3 1963 30.8 459.7 39.5 55.3 79.2 69.6
4 1964 31.2 492.9 37.3 54.7 77.4 68.7
5 1965 33.3 528.6 38.1 63.7 80.2 73.6
6 1966 35.6 560.3 39.3 69.8 80.4 76.3
7 1967 36.4 624.6 37.8 65.9 83.9 77.2
8 1968 36.7 666.4 38.4 64.5 85.5 78.1
9 1969 38.4 717.8 40.1 70.0 93.7 84.7
10 1970 40.4 768.2 38.6 73.2 106.1 93.3
11 1971 40.3 843.3 39.8 67.8 104.8 89.7
12 1972 41.8 911.6 39.7 79.1 114.0 100.7
13 1973 40.4 931.1 52.1 95.4 124.1 113.5
14 1974 40.7 1021.5 48.9 94.2 127.6 115.3
15 1975 40.1 1165.9 58.3 123.5 142.9 136.7
16 1976 42.7 1349.6 57.9 129.9 143.6 139.2
17 1977 44.1 1449.4 56.5 117.6 139.2 132.0
18 1978 46.7 1575.5 63.7 130.9 165.5 132.1
19 1979 50.6 1759.1 61.6 129.8 203.3 154.4
20 1980 50.1 1994.2 58.9 128.0 219.6 174.9
21 1981 51.7 2258.1 66.4 141.0 221.6 180.8
22 1982 52.9 2478.7 70.4 168.2 232.6 189.4
# Inspect the structure of the data (index, columns, non-null counts, dtypes)
dados.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 23 entries, 0 to 22
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 YEAR 23 non-null int64
1 Y 23 non-null float64
2 X2 23 non-null float64
3 X3 23 non-null float64
4 X4 23 non-null float64
5 X5 23 non-null float64
6 X6 23 non-null float64
dtypes: float64(6), int64(1)
memory usage: 1.4 KB
# Mean of Y
dados['Y'].mean()
39.669565217391316
# Median of Y
dados['Y'].median()
40.3
# Mode of Y (returned as a Series)
dados['Y'].mode()
0 40.4
dtype: float64
# Variance of Y (same value as R's var() output earlier in this document)
dados['Y'].var()
54.36039525691701
# Standard deviation of Y
dados['Y'].std()
7.372950241044423
# Covariance matrix of all columns (the printed output is truncated with "...")
dados.cov()
YEAR Y ... X5 X6
YEAR 46.000000 48.918182 ... 322.518182 257.463636
Y 48.918182 54.360395 ... 355.159150 275.179071
X2 3935.518182 4314.699348 ... 31369.597470 24175.210692
X3 66.554545 68.848498 ... 531.579229 418.074802
X4 226.272727 236.950455 ... 1706.197727 1364.505000
X5 322.518182 355.159150 ... 2652.223123 2016.302747
X6 257.463636 275.179071 ... 2016.302747 1585.208024
[7 rows x 7 columns]
# Correlation matrix of all columns
dados.corr()
YEAR Y X2 X3 X4 X5 X6
YEAR 1.000000 0.978251 0.939165 0.882680 0.947149 0.923358 0.953441
Y 0.978251 1.000000 0.947171 0.839958 0.912392 0.935355 0.937413
X2 0.939165 0.947171 1.000000 0.931681 0.957131 0.985878 0.982757
X3 0.882680 0.839958 0.931681 1.000000 0.970112 0.928469 0.944529
X4 0.947149 0.912392 0.957131 0.970112 1.000000 0.940567 0.972965
X5 0.923358 0.935355 0.985878 0.928469 0.940567 1.000000 0.983349
X6 0.953441 0.937413 0.982757 0.944529 0.972965 0.983349 1.000000
import numpy as np
q1 = np.percentile(dados['Y'], 25) # 25th percentile, i.e. the first quartile
q1
34.45
q3 = np.percentile(dados['Y'], 75) # 75th percentile, i.e. the third quartile
q3
43.400000000000006
dados.describe()
YEAR Y X2 ... X4 X5 X6
count 23.00000 23.000000 23.000000 ... 23.000000 23.000000 23.000000
mean 1971.00000 39.669565 1035.065217 ... 90.400000 124.430435 107.856522
std 6.78233 7.372950 617.847020 ... 35.223688 51.499739 39.814671
min 1960.00000 27.800000 397.500000 ... 50.700000 77.400000 65.800000
25% 1965.50000 34.450000 544.450000 ... 64.100000 80.300000 74.950000
50% 1971.00000 40.300000 843.300000 ... 73.200000 106.100000 93.300000
75% 1976.50000 43.400000 1399.500000 ... 125.750000 143.250000 134.400000
max 1982.00000 52.900000 2478.700000 ... 168.200000 232.600000 189.400000
[8 rows x 7 columns]
skimpy
library(reticulate)
# Install the Python package "skimpy" only when it cannot already be imported
# from the active Python environment, so that re-running (or re-knitting) the
# document does not trigger a fresh installation every time.
if (!py_module_available("skimpy")) {
  py_install("skimpy")
}
# Skim
from skimpy import skim
skim(dados)
GUJARATI, Damodar N.; PORTER, Dawn C. Econometria básica. 5.ed. Porto Alegre: AMGH/Bookman/McGraw-Hill do Brasil, 2011. Número de chamada: 330.015195 G969e.5 (CBC) e recurso online ISBN 9788580550511 (Minha Biblioteca - UFMS).