1 Importando os dados

library(readxl)
dados <- read_excel("C:/Users/user/Desktop/Vida acadêmica/Disciplinas/Economia da Inovação/Covid/Dados_vitor.xlsx")
attach(dados)
str(dados)
## tibble [80 x 9] (S3: tbl_df/tbl/data.frame)
##  $ Country            : chr [1:80] "Argentina" "Armenia" "Australia" "Austria" ...
##  $ IDH_2018           : num [1:80] 0.83 0.76 0.938 0.914 0.754 0.838 0.919 0.761 0.845 0.581 ...
##  $ GDP_Mean           : num [1:80] 12533 3991 55935 48414 5159 ...
##  $ GCI_2019           : num [1:80] 57.2 61.3 78.7 76.6 62.7 65.4 76.4 60.9 62.8 52.1 ...
##  $ ReD                : num [1:80] 0.579 0.228 1.897 3.097 0.202 ...
##  $ Patent_Resident    : num [1:80] 551 113 2432 2097 171 ...
##  $ Patent_Non_Resident: num [1:80] 3393.8 2.4 25931.8 228.8 19 ...
##  $ Education          : num [1:80] 5.54 2.63 5.25 5.47 2.74 ...
##  $ C_Death            : num [1:80] 7.326 24.905 0.741 7.972 4.349 ...

2 Normalidade da VD

hist(dados$C_Death, main = "Frequency distribution of deaths by Covid", xlab = "Deaths", ylab = "Frequency")

shapiro.test(dados$C_Death) # Avariável depedente (Mortes por covid) não apresentam distribuição normal
## 
##  Shapiro-Wilk normality test
## 
## data:  dados$C_Death
## W = 0.71673, p-value = 3.959e-11

3 Correlações

pairs(dados[,2:9])

library(corrplot)
Cor <- cor(dados[,2:9])
corrplot(Cor, tl.cex = 1, tl.col = "black")

corrplot(Cor, method = "number", tl.col = "black")

corrplot(Cor, type = "lower", method = "number", tl.col = "black")

library(Hmisc)
m <- rcorr(as.matrix(dados[,2:9]))
corrplot(m$r, p.mat=m$P, sig.level = 0.05, method = "number", type = "upper", tl.cex = 0.7) # Gráfico números eliminando as correlações não significativas 

library(ggplot2)
library(GGally)
ggpairs(dados[,2:9],lower = list(continuous="smooth"))

4 Regressão linear

4.1 Modelo

lm <- lm(dados$C_Death ~ IDH_2018 + GDP_Mean + GCI_2019 + ReD + Patent_Resident +
           Patent_Non_Resident + Education, data = dados)
summary(lm)
## 
## Call:
## lm(formula = dados$C_Death ~ IDH_2018 + GDP_Mean + GCI_2019 + 
##     ReD + Patent_Resident + Patent_Non_Resident + Education, 
##     data = dados)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.199 -11.330  -4.398   3.390  65.537 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)  
## (Intercept)         -4.442e+01  2.307e+01  -1.926   0.0581 .
## IDH_2018            -2.355e+01  4.191e+01  -0.562   0.5759  
## GDP_Mean            -5.604e-05  1.488e-04  -0.377   0.7076  
## GCI_2019             1.083e+00  6.211e-01   1.744   0.0854 .
## ReD                 -3.734e+00  3.345e+00  -1.116   0.2681  
## Patent_Resident     -3.011e-05  2.097e-05  -1.436   0.1554  
## Patent_Non_Resident  9.400e-05  7.174e-05   1.310   0.1943  
## Education            2.261e+00  1.702e+00   1.328   0.1884  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.35 on 72 degrees of freedom
## Multiple R-squared:  0.1622, Adjusted R-squared:  0.0808 
## F-statistic: 1.992 on 7 and 72 DF,  p-value: 0.0679

4.2 Coeficientes

library(QuantPsyc)
lm$coefficients # Coeficientes
##         (Intercept)            IDH_2018            GDP_Mean            GCI_2019 
##       -4.442085e+01       -2.354957e+01       -5.603577e-05        1.083124e+00 
##                 ReD     Patent_Resident Patent_Non_Resident           Education 
##       -3.733862e+00       -3.010688e-05        9.399697e-05        2.260721e+00
lm.beta(lm) # valores padronizados
##            IDH_2018            GDP_Mean            GCI_2019                 ReD 
##         -0.15090034         -0.06923041          0.61131308         -0.21121058 
##     Patent_Resident Patent_Non_Resident           Education 
##         -0.20886015          0.18630070          0.16255175
confint(lm) # intervalo de confiança dos coenficientes 
##                             2.5 %       97.5 %
## (Intercept)         -9.040253e+01 1.560835e+00
## IDH_2018            -1.071001e+02 6.000099e+01
## GDP_Mean            -3.526356e-04 2.405640e-04
## GCI_2019            -1.549829e-01 2.321230e+00
## ReD                 -1.040237e+01 2.934644e+00
## Patent_Resident     -7.190545e-05 1.169170e-05
## Patent_Non_Resident -4.901056e-05 2.370045e-04
## Education           -1.132963e+00 5.654404e+00

4.3 Plot

lm$residuals
##           1           2           3           4           5           6 
##  -0.6158385  15.9625128 -22.0062773  -7.0949703  -6.5354675  -2.0439180 
##           7           8           9          10          11          12 
##  65.5374126  28.3444598  -8.9911043  -2.4217595  27.5237924   2.8279407 
##          13          14          15          16          17          18 
##   3.9997343 -15.0924578 -16.2671136 -11.3256108 -13.6554724  24.3731256 
##          19          20          21          22          23          24 
##   2.1064784 -12.4189621  -1.8681193 -17.2806642  23.9757465  -9.6276421 
##          25          26          27          28          29          30 
##  -8.9188823  -4.8649909   6.0051669   0.1842984  -4.5891610 -17.2465585 
##          31          32          33          34          35          36 
## -14.4264104  20.9388294  19.9837995  -6.1104588  43.9622207 -13.4601212 
##          37          38          39          40          41          42 
##  -9.8214350  -5.5312019 -10.8892702   9.7922961 -14.9019874 -11.3447749 
##          43          44          45          46          47          48 
##   2.5801203   4.0195214 -23.0123335 -17.0395516 -14.9418710  16.2847615 
##          49          50          51          52          53          54 
##   7.1082291  -4.6831556   1.5844607  -4.4875519  11.0934065 -24.1990487 
##          55          56          57          58          59          60 
##  -3.1826909 -18.7440637  -0.6741112  44.5913252 -11.2276241  -0.1003061 
##          61          62          63          64          65          66 
## -10.3545826   1.3129562  -3.5911370  -5.6767582  -1.2963778 -20.9729167 
##          67          68          69          70          71          72 
## -12.4458691  -6.9687090  -4.3090441  41.3736797  -4.0662506  33.4134269 
##          73          74          75          76          77          78 
##   2.9436058 -11.2603565  -1.2262834  -6.0362063  41.6371685   3.1861072 
##          79          80 
## -12.8164341 -13.9827191
par(mfrow = c(2,2), mar = c(2,2,2,2)) # Residuals vs Fitted = Linearidade; Normal QQ = Distribuição normal; Scale-Location = Homocedasticidade; Residuals vs Leverage = Distância de Cook
plot(lm)

dev.off()
## null device 
##           1

4.4 ANOVA do modelo

anova(lm)
## Analysis of Variance Table
## 
## Response: dados$C_Death
##                     Df  Sum Sq Mean Sq F value   Pr(>F)   
## IDH_2018             1  2376.5 2376.49  7.0615 0.009695 **
## GDP_Mean             1   104.4  104.38  0.3102 0.579314   
## GCI_2019             1   494.3  494.30  1.4688 0.229500   
## ReD                  1   239.7  239.69  0.7122 0.401501   
## Patent_Resident      1   349.5  349.52  1.0386 0.311569   
## Patent_Non_Resident  1   534.8  534.80  1.5891 0.211521   
## Education            1   593.5  593.48  1.7635 0.188387   
## Residuals           72 24230.9  336.54                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.5 Distância d Cook

plot(lm, pch = 18, col = "red", which=c(4))

cooks.distance(lm)
##            1            2            3            4            5            6 
## 1.739266e-05 5.174170e-03 8.304085e-03 1.588041e-03 9.147227e-04 1.301692e-04 
##            7            8            9           10           11           12 
## 8.827911e-02 1.625913e-02 2.271749e-03 2.442759e-04 2.306426e-02 1.643778e+00 
##           13           14           15           16           17           18 
## 2.026764e-04 8.198949e-03 7.298900e-03 2.048853e-03 8.118922e-03 1.155358e-02 
##           19           20           21           22           23           24 
## 6.169139e-05 1.767849e-03 2.650445e-04 9.408810e-03 1.230771e-02 1.210185e-03 
##           25           26           27           28           29           30 
## 2.059482e-03 8.637009e-04 6.525838e-04 1.536184e-06 3.097149e-04 1.452828e-02 
##           31           32           33           34           35           36 
## 8.314030e-03 3.616888e-02 1.980202e-02 5.847148e-03 2.129952e-02 1.106687e-02 
##           37           38           39           40           41           42 
## 1.152850e-03 7.130594e-04 1.587692e-02 2.407669e-03 3.137382e-03 1.633543e-03 
##           43           44           45           46           47           48 
## 2.333932e-03 9.071848e-04 3.199061e-02 6.951133e-03 2.878797e-03 5.265185e-03 
##           49           50           51           52           53           54 
## 8.666217e-04 5.558138e-04 3.545048e-04 3.578844e-04 3.373062e-03 1.425557e-02 
##           55           56           57           58           59           60 
## 1.497232e-04 2.801132e-02 1.795060e-05 2.489778e-02 1.743313e-03 8.961438e-08 
##           61           62           63           64           65           66 
## 8.521211e-03 3.026927e-05 1.574824e-04 1.826434e-03 2.737389e-05 2.982179e-02 
##           67           68           69           70           71           72 
## 1.707012e-03 1.097238e-03 5.480136e-04 3.152811e-02 5.510704e-04 5.091081e-02 
##           73           74           75           76           77           78 
## 6.056768e-04 3.694802e-03 7.628647e-05 5.781123e-04 5.223902e-02 3.549574e+00 
##           79           80 
## 1.602004e-03 4.070272e-03
meuCD <- cooks.distance(lm)
mecdr <- round(meuCD,5)
sort(mecdr) # 15 é problemático
##      28      60       1      57      62      65      19      75       6      55 
## 0.00000 0.00000 0.00002 0.00002 0.00003 0.00003 0.00006 0.00008 0.00013 0.00015 
##      63      13      10      21      29      51      52      69      71      50 
## 0.00016 0.00020 0.00024 0.00027 0.00031 0.00035 0.00036 0.00055 0.00055 0.00056 
##      76      73      27      38      26      49       5      44      68      37 
## 0.00058 0.00061 0.00065 0.00071 0.00086 0.00087 0.00091 0.00091 0.00110 0.00115 
##      24       4      79      42      67      59      20      64      16      25 
## 0.00121 0.00159 0.00160 0.00163 0.00171 0.00174 0.00177 0.00183 0.00205 0.00206 
##       9      43      40      47      41      53      74      80       2      48 
## 0.00227 0.00233 0.00241 0.00288 0.00314 0.00337 0.00369 0.00407 0.00517 0.00527 
##      34      46      15      17      14       3      31      61      22      36 
## 0.00585 0.00695 0.00730 0.00812 0.00820 0.00830 0.00831 0.00852 0.00941 0.01107 
##      18      23      54      30      39       8      33      35      11      58 
## 0.01155 0.01231 0.01426 0.01453 0.01588 0.01626 0.01980 0.02130 0.02306 0.02490 
##      56      66      70      45      32      72      77       7      12      78 
## 0.02801 0.02982 0.03153 0.03199 0.03617 0.05091 0.05224 0.08828 1.64378 3.54957
dados$cooks.distance <- cooks.distance(lm)
library(plotly)
library(magrittr)
plot_ly(data = dados, y = dados$cooks.distance, type = "scatter") %>% layout(title = "Distância de Cook") # O valor mostrado é sempre +1 na base de dados
library(car)
influenceIndexPlot(lm, id.n = 5) # 15 e 23 são problemáticos, mas são dados reais. Cabe ao grupo definir se remove ou não da base

4.6 Calcular a correlação entre as VI’s (multicolineariedade)

library(car)
vif(lm) # Ponto de corte: VIF até 10. Existe multicolinearidade
##            IDH_2018            GDP_Mean            GCI_2019                 ReD 
##            6.198854            2.904043           10.560524            3.077302 
##     Patent_Resident Patent_Non_Resident           Education 
##            1.818448            1.737470            1.287746
mean(vif(lm))
## [1] 3.940627
1/vif(lm) # Tolerância: Ponto de corte: valores menores do que 0.2
##            IDH_2018            GDP_Mean            GCI_2019                 ReD 
##          0.16132015          0.34434752          0.09469227          0.32495998 
##     Patent_Resident Patent_Non_Resident           Education 
##          0.54991951          0.57554960          0.77655036

4.7 Durbin-Watson (autocorrelação dos resíduos)

library(lmtest) # Ok
dwtest(lm)
## 
##  Durbin-Watson test
## 
## data:  lm
## DW = 1.9572, p-value = 0.3401
## alternative hypothesis: true autocorrelation is greater than 0
library(car)
durbinWatsonTest(lm) #H0 resíduos são aleatórios
##  lag Autocorrelation D-W Statistic p-value
##    1       0.0173574      1.957201   0.842
##  Alternative hypothesis: rho != 0

4.8 Resíduos

res <- rstandard(lm(dados$C_Death ~ IDH_2018 + GDP_Mean + GCI_2019 + ReD + Patent_Resident +
                      Patent_Non_Resident + Education, data = dados))
res <- as.data.frame(res)                
library(car)
residualPlots(lm)

##                     Test stat Pr(>|Test stat|)
## IDH_2018              -0.1996           0.8424
## GDP_Mean              -0.4166           0.6783
## GCI_2019               0.7431           0.4599
## ReD                   -0.5033           0.6163
## Patent_Resident        0.7025           0.4847
## Patent_Non_Resident    0.9529           0.3439
## Education             -0.7254           0.4706
## Tukey test             1.1918           0.2334

4.9 Outliers dos resíudos

dados$Zscoreresiduos <- res
dados$Zscoreresiduos
##             res
## 1  -0.035385789
## 2   0.892451010
## 3  -1.225806978
## 4  -0.401688475
## 5  -0.365861010
## 6  -0.115669864
## 7   3.665189510
## 8   1.584588830
## 9  -0.507132198
## 10 -0.138566805
## 11  1.556429546
## 12  0.755655927
## 13  0.221598319
## 14 -0.858525436
## 15 -0.917001127
## 16 -0.629985542
## 17 -0.782822665
## 18  1.361323273
## 19  0.116881189
## 20 -0.687032480
## 21 -0.110343559
## 22 -0.978319341
## 23  1.342174771
## 24 -0.533654865
## 25 -0.501825858
## 26 -0.276886389
## 27  0.334878313
## 28  0.010583047
## 29 -0.254883803
## 30 -0.993890275
## 31 -0.824009547
## 32  1.243589503
## 33  1.152464172
## 34 -0.382616771
## 35  2.430718675
## 36 -0.784697348
## 37 -0.543661273
## 38 -0.310311625
## 39 -0.671911744
## 40  0.550486942
## 41 -0.827084798
## 42 -0.628555408
## 43  0.177494005
## 44  0.233261886
## 45 -1.340742792
## 46 -0.956640620
## 47 -0.828056340
## 48  0.909989486
## 49  0.395948881
## 50 -0.263338883
## 51  0.098243210
## 52 -0.250152775
## 53  0.625231602
## 54 -1.359210430
## 55 -0.176783838
## 56 -1.110682748
## 57 -0.038486331
## 58  2.470059568
## 59 -0.622926248
## 60 -0.005531437
## 61 -0.613433792
## 62  0.073170697
## 63 -0.198849277
## 64 -0.329597634
## 65 -0.072137954
## 66 -1.230079055
## 67 -0.688145018
## 68 -0.390641403
## 69 -0.243423081
## 70  2.308078313
## 71 -0.230654830
## 72  1.919425401
## 73  0.172964544
## 74 -0.635850876
## 75 -0.070798082
## 76 -0.335720460
## 77  2.353718914
## 78  0.969897416
## 79 -0.707518676
## 80 -0.782225149
round(dados$Zscoreresiduos, digits = 3)
##       res
## 1  -0.035
## 2   0.892
## 3  -1.226
## 4  -0.402
## 5  -0.366
## 6  -0.116
## 7   3.665
## 8   1.585
## 9  -0.507
## 10 -0.139
## 11  1.556
## 12  0.756
## 13  0.222
## 14 -0.859
## 15 -0.917
## 16 -0.630
## 17 -0.783
## 18  1.361
## 19  0.117
## 20 -0.687
## 21 -0.110
## 22 -0.978
## 23  1.342
## 24 -0.534
## 25 -0.502
## 26 -0.277
## 27  0.335
## 28  0.011
## 29 -0.255
## 30 -0.994
## 31 -0.824
## 32  1.244
## 33  1.152
## 34 -0.383
## 35  2.431
## 36 -0.785
## 37 -0.544
## 38 -0.310
## 39 -0.672
## 40  0.550
## 41 -0.827
## 42 -0.629
## 43  0.177
## 44  0.233
## 45 -1.341
## 46 -0.957
## 47 -0.828
## 48  0.910
## 49  0.396
## 50 -0.263
## 51  0.098
## 52 -0.250
## 53  0.625
## 54 -1.359
## 55 -0.177
## 56 -1.111
## 57 -0.038
## 58  2.470
## 59 -0.623
## 60 -0.006
## 61 -0.613
## 62  0.073
## 63 -0.199
## 64 -0.330
## 65 -0.072
## 66 -1.230
## 67 -0.688
## 68 -0.391
## 69 -0.243
## 70  2.308
## 71 -0.231
## 72  1.919
## 73  0.173
## 74 -0.636
## 75 -0.071
## 76 -0.336
## 77  2.354
## 78  0.970
## 79 -0.708
## 80 -0.782
dados$large.residuals <- dados$Zscoreresiduos > 2.9 | dados$Zscoreresiduos < - 2.9
summary(dados$large.residuals) # 1 outlier: observação 5
##     res         
##  Mode :logical  
##  FALSE:79       
##  TRUE :1

4.10 Normalidade para os resíduos (Shapiro-Wilk)

res <- rstandard(lm(dados$C_Death ~ IDH_2018 + GDP_Mean + GCI_2019 + ReD + Patent_Resident +
                      Patent_Non_Resident + Education, data = dados))
shapiro.test(res)
## 
##  Shapiro-Wilk normality test
## 
## data:  res
## W = 0.87877, p-value = 1.721e-06
hist(res, breaks = 15, freq = T, col = "lightgray", main = "Distribuição dos resíduos", xlab = "Escore")

qq <- qplot(sample = res)
ggplotly(qq) # resíduos não possuem distribuição normal

4.11 Teste de variância - homocedasticidade (Breush-Pagan)

library(ggplot2)
dados$fitted <- lm$fitted.values
dados$standr <- lm$residuals
dados[,c("C_Death", "fitted")]
## # A tibble: 80 x 2
##    C_Death fitted
##      <dbl>  <dbl>
##  1   7.33    7.94
##  2  24.9     8.94
##  3   0.741  22.7 
##  4   7.97   15.1 
##  5   4.35   10.9 
##  6   8.58   10.6 
##  7  84.9    19.4 
##  8  42.4    14.1 
##  9   0.686   9.68
## 10   0       2.42
## # ... with 70 more rows
ggplot(data=dados, aes(fitted, standr)) + geom_point() + geom_smooth(color = "blue") + labs(x= "Valores ajustados", y = "Resíduos padronizados")

ggplot(data=dados, aes(fitted,standr)) + geom_point() + geom_smooth(method = "lm", color = "blue") + labs(x= "Valores ajustados", y = "Resíduos padronizados")

library(car)
ncvTest(lm(dados$C_Death ~ IDH_2018 + GDP_Mean + GCI_2019 + ReD + Patent_Resident +
             Patent_Non_Resident + Education, data = dados)) # Dados possuem uma distribuição heterocedástica
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 13.58572, Df = 1, p = 0.00022791