Correlações
pairs(dados[,2:9])

library(corrplot)
Cor <- cor(dados[,2:9])
corrplot(Cor, tl.cex = 1, tl.col = "black")

corrplot(Cor, method = "number", tl.col = "black")

corrplot(Cor, type = "lower", method = "number", tl.col = "black")

library(Hmisc)
m <- rcorr(as.matrix(dados[,2:9]))
corrplot(m$r, p.mat=m$P, sig.level = 0.05, method = "number", type = "upper", tl.cex = 0.7) # Gráfico números eliminando as correlações não significativas

library(ggplot2)
library(GGally)
ggpairs(dados[,2:9],lower = list(continuous="smooth"))

Regressão linear
Modelo
lm <- lm(dados$C_Death ~ IDH_2018 + GDP_Mean + GCI_2019 + ReD + Patent_Resident +
Patent_Non_Resident + Education, data = dados)
summary(lm)
##
## Call:
## lm(formula = dados$C_Death ~ IDH_2018 + GDP_Mean + GCI_2019 +
## ReD + Patent_Resident + Patent_Non_Resident + Education,
## data = dados)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.199 -11.330 -4.398 3.390 65.537
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.442e+01 2.307e+01 -1.926 0.0581 .
## IDH_2018 -2.355e+01 4.191e+01 -0.562 0.5759
## GDP_Mean -5.604e-05 1.488e-04 -0.377 0.7076
## GCI_2019 1.083e+00 6.211e-01 1.744 0.0854 .
## ReD -3.734e+00 3.345e+00 -1.116 0.2681
## Patent_Resident -3.011e-05 2.097e-05 -1.436 0.1554
## Patent_Non_Resident 9.400e-05 7.174e-05 1.310 0.1943
## Education 2.261e+00 1.702e+00 1.328 0.1884
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.35 on 72 degrees of freedom
## Multiple R-squared: 0.1622, Adjusted R-squared: 0.0808
## F-statistic: 1.992 on 7 and 72 DF, p-value: 0.0679
Coeficientes
library(QuantPsyc)
lm$coefficients # Coeficientes
## (Intercept) IDH_2018 GDP_Mean GCI_2019
## -4.442085e+01 -2.354957e+01 -5.603577e-05 1.083124e+00
## ReD Patent_Resident Patent_Non_Resident Education
## -3.733862e+00 -3.010688e-05 9.399697e-05 2.260721e+00
lm.beta(lm) # valores padronizados
## IDH_2018 GDP_Mean GCI_2019 ReD
## -0.15090034 -0.06923041 0.61131308 -0.21121058
## Patent_Resident Patent_Non_Resident Education
## -0.20886015 0.18630070 0.16255175
confint(lm) # intervalo de confiança dos coenficientes
## 2.5 % 97.5 %
## (Intercept) -9.040253e+01 1.560835e+00
## IDH_2018 -1.071001e+02 6.000099e+01
## GDP_Mean -3.526356e-04 2.405640e-04
## GCI_2019 -1.549829e-01 2.321230e+00
## ReD -1.040237e+01 2.934644e+00
## Patent_Resident -7.190545e-05 1.169170e-05
## Patent_Non_Resident -4.901056e-05 2.370045e-04
## Education -1.132963e+00 5.654404e+00
Plot
lm$residuals
## 1 2 3 4 5 6
## -0.6158385 15.9625128 -22.0062773 -7.0949703 -6.5354675 -2.0439180
## 7 8 9 10 11 12
## 65.5374126 28.3444598 -8.9911043 -2.4217595 27.5237924 2.8279407
## 13 14 15 16 17 18
## 3.9997343 -15.0924578 -16.2671136 -11.3256108 -13.6554724 24.3731256
## 19 20 21 22 23 24
## 2.1064784 -12.4189621 -1.8681193 -17.2806642 23.9757465 -9.6276421
## 25 26 27 28 29 30
## -8.9188823 -4.8649909 6.0051669 0.1842984 -4.5891610 -17.2465585
## 31 32 33 34 35 36
## -14.4264104 20.9388294 19.9837995 -6.1104588 43.9622207 -13.4601212
## 37 38 39 40 41 42
## -9.8214350 -5.5312019 -10.8892702 9.7922961 -14.9019874 -11.3447749
## 43 44 45 46 47 48
## 2.5801203 4.0195214 -23.0123335 -17.0395516 -14.9418710 16.2847615
## 49 50 51 52 53 54
## 7.1082291 -4.6831556 1.5844607 -4.4875519 11.0934065 -24.1990487
## 55 56 57 58 59 60
## -3.1826909 -18.7440637 -0.6741112 44.5913252 -11.2276241 -0.1003061
## 61 62 63 64 65 66
## -10.3545826 1.3129562 -3.5911370 -5.6767582 -1.2963778 -20.9729167
## 67 68 69 70 71 72
## -12.4458691 -6.9687090 -4.3090441 41.3736797 -4.0662506 33.4134269
## 73 74 75 76 77 78
## 2.9436058 -11.2603565 -1.2262834 -6.0362063 41.6371685 3.1861072
## 79 80
## -12.8164341 -13.9827191
par(mfrow = c(2,2), mar = c(2,2,2,2)) # Residuals vs Fitted = Linearidade; Normal QQ = Distribuição normal; Scale-Location = Homocedasticidade; Residuals vs Leverage = Distância de Cook
plot(lm)

dev.off()
## null device
## 1
ANOVA do modelo
anova(lm)
## Analysis of Variance Table
##
## Response: dados$C_Death
## Df Sum Sq Mean Sq F value Pr(>F)
## IDH_2018 1 2376.5 2376.49 7.0615 0.009695 **
## GDP_Mean 1 104.4 104.38 0.3102 0.579314
## GCI_2019 1 494.3 494.30 1.4688 0.229500
## ReD 1 239.7 239.69 0.7122 0.401501
## Patent_Resident 1 349.5 349.52 1.0386 0.311569
## Patent_Non_Resident 1 534.8 534.80 1.5891 0.211521
## Education 1 593.5 593.48 1.7635 0.188387
## Residuals 72 24230.9 336.54
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Distância d Cook
plot(lm, pch = 18, col = "red", which=c(4))

cooks.distance(lm)
## 1 2 3 4 5 6
## 1.739266e-05 5.174170e-03 8.304085e-03 1.588041e-03 9.147227e-04 1.301692e-04
## 7 8 9 10 11 12
## 8.827911e-02 1.625913e-02 2.271749e-03 2.442759e-04 2.306426e-02 1.643778e+00
## 13 14 15 16 17 18
## 2.026764e-04 8.198949e-03 7.298900e-03 2.048853e-03 8.118922e-03 1.155358e-02
## 19 20 21 22 23 24
## 6.169139e-05 1.767849e-03 2.650445e-04 9.408810e-03 1.230771e-02 1.210185e-03
## 25 26 27 28 29 30
## 2.059482e-03 8.637009e-04 6.525838e-04 1.536184e-06 3.097149e-04 1.452828e-02
## 31 32 33 34 35 36
## 8.314030e-03 3.616888e-02 1.980202e-02 5.847148e-03 2.129952e-02 1.106687e-02
## 37 38 39 40 41 42
## 1.152850e-03 7.130594e-04 1.587692e-02 2.407669e-03 3.137382e-03 1.633543e-03
## 43 44 45 46 47 48
## 2.333932e-03 9.071848e-04 3.199061e-02 6.951133e-03 2.878797e-03 5.265185e-03
## 49 50 51 52 53 54
## 8.666217e-04 5.558138e-04 3.545048e-04 3.578844e-04 3.373062e-03 1.425557e-02
## 55 56 57 58 59 60
## 1.497232e-04 2.801132e-02 1.795060e-05 2.489778e-02 1.743313e-03 8.961438e-08
## 61 62 63 64 65 66
## 8.521211e-03 3.026927e-05 1.574824e-04 1.826434e-03 2.737389e-05 2.982179e-02
## 67 68 69 70 71 72
## 1.707012e-03 1.097238e-03 5.480136e-04 3.152811e-02 5.510704e-04 5.091081e-02
## 73 74 75 76 77 78
## 6.056768e-04 3.694802e-03 7.628647e-05 5.781123e-04 5.223902e-02 3.549574e+00
## 79 80
## 1.602004e-03 4.070272e-03
meuCD <- cooks.distance(lm)
mecdr <- round(meuCD,5)
sort(mecdr) # 15 é problemático
## 28 60 1 57 62 65 19 75 6 55
## 0.00000 0.00000 0.00002 0.00002 0.00003 0.00003 0.00006 0.00008 0.00013 0.00015
## 63 13 10 21 29 51 52 69 71 50
## 0.00016 0.00020 0.00024 0.00027 0.00031 0.00035 0.00036 0.00055 0.00055 0.00056
## 76 73 27 38 26 49 5 44 68 37
## 0.00058 0.00061 0.00065 0.00071 0.00086 0.00087 0.00091 0.00091 0.00110 0.00115
## 24 4 79 42 67 59 20 64 16 25
## 0.00121 0.00159 0.00160 0.00163 0.00171 0.00174 0.00177 0.00183 0.00205 0.00206
## 9 43 40 47 41 53 74 80 2 48
## 0.00227 0.00233 0.00241 0.00288 0.00314 0.00337 0.00369 0.00407 0.00517 0.00527
## 34 46 15 17 14 3 31 61 22 36
## 0.00585 0.00695 0.00730 0.00812 0.00820 0.00830 0.00831 0.00852 0.00941 0.01107
## 18 23 54 30 39 8 33 35 11 58
## 0.01155 0.01231 0.01426 0.01453 0.01588 0.01626 0.01980 0.02130 0.02306 0.02490
## 56 66 70 45 32 72 77 7 12 78
## 0.02801 0.02982 0.03153 0.03199 0.03617 0.05091 0.05224 0.08828 1.64378 3.54957
dados$cooks.distance <- cooks.distance(lm)
library(plotly)
library(magrittr)
plot_ly(data = dados, y = dados$cooks.distance, type = "scatter") %>% layout(title = "Distância de Cook") # O valor mostrado é sempre +1 na base de dados
library(car)
influenceIndexPlot(lm, id.n = 5) # 15 e 23 são problemáticos, mas são dados reais. Cabe ao grupo definir se remove ou não da base

Calcular a correlação entre as VI’s (multicolineariedade)
library(car)
vif(lm) # Ponto de corte: VIF até 10. Existe multicolinearidade
## IDH_2018 GDP_Mean GCI_2019 ReD
## 6.198854 2.904043 10.560524 3.077302
## Patent_Resident Patent_Non_Resident Education
## 1.818448 1.737470 1.287746
mean(vif(lm))
## [1] 3.940627
1/vif(lm) # Tolerância: Ponto de corte: valores menores do que 0.2
## IDH_2018 GDP_Mean GCI_2019 ReD
## 0.16132015 0.34434752 0.09469227 0.32495998
## Patent_Resident Patent_Non_Resident Education
## 0.54991951 0.57554960 0.77655036
Durbin-Watson (autocorrelação dos resíduos)
library(lmtest) # Ok
dwtest(lm)
##
## Durbin-Watson test
##
## data: lm
## DW = 1.9572, p-value = 0.3401
## alternative hypothesis: true autocorrelation is greater than 0
library(car)
durbinWatsonTest(lm) #H0 resíduos são aleatórios
## lag Autocorrelation D-W Statistic p-value
## 1 0.0173574 1.957201 0.842
## Alternative hypothesis: rho != 0
Resíduos
res <- rstandard(lm(dados$C_Death ~ IDH_2018 + GDP_Mean + GCI_2019 + ReD + Patent_Resident +
Patent_Non_Resident + Education, data = dados))
res <- as.data.frame(res)
library(car)
residualPlots(lm)

## Test stat Pr(>|Test stat|)
## IDH_2018 -0.1996 0.8424
## GDP_Mean -0.4166 0.6783
## GCI_2019 0.7431 0.4599
## ReD -0.5033 0.6163
## Patent_Resident 0.7025 0.4847
## Patent_Non_Resident 0.9529 0.3439
## Education -0.7254 0.4706
## Tukey test 1.1918 0.2334
Outliers dos resíudos
dados$Zscoreresiduos <- res
dados$Zscoreresiduos
## res
## 1 -0.035385789
## 2 0.892451010
## 3 -1.225806978
## 4 -0.401688475
## 5 -0.365861010
## 6 -0.115669864
## 7 3.665189510
## 8 1.584588830
## 9 -0.507132198
## 10 -0.138566805
## 11 1.556429546
## 12 0.755655927
## 13 0.221598319
## 14 -0.858525436
## 15 -0.917001127
## 16 -0.629985542
## 17 -0.782822665
## 18 1.361323273
## 19 0.116881189
## 20 -0.687032480
## 21 -0.110343559
## 22 -0.978319341
## 23 1.342174771
## 24 -0.533654865
## 25 -0.501825858
## 26 -0.276886389
## 27 0.334878313
## 28 0.010583047
## 29 -0.254883803
## 30 -0.993890275
## 31 -0.824009547
## 32 1.243589503
## 33 1.152464172
## 34 -0.382616771
## 35 2.430718675
## 36 -0.784697348
## 37 -0.543661273
## 38 -0.310311625
## 39 -0.671911744
## 40 0.550486942
## 41 -0.827084798
## 42 -0.628555408
## 43 0.177494005
## 44 0.233261886
## 45 -1.340742792
## 46 -0.956640620
## 47 -0.828056340
## 48 0.909989486
## 49 0.395948881
## 50 -0.263338883
## 51 0.098243210
## 52 -0.250152775
## 53 0.625231602
## 54 -1.359210430
## 55 -0.176783838
## 56 -1.110682748
## 57 -0.038486331
## 58 2.470059568
## 59 -0.622926248
## 60 -0.005531437
## 61 -0.613433792
## 62 0.073170697
## 63 -0.198849277
## 64 -0.329597634
## 65 -0.072137954
## 66 -1.230079055
## 67 -0.688145018
## 68 -0.390641403
## 69 -0.243423081
## 70 2.308078313
## 71 -0.230654830
## 72 1.919425401
## 73 0.172964544
## 74 -0.635850876
## 75 -0.070798082
## 76 -0.335720460
## 77 2.353718914
## 78 0.969897416
## 79 -0.707518676
## 80 -0.782225149
round(dados$Zscoreresiduos, digits = 3)
## res
## 1 -0.035
## 2 0.892
## 3 -1.226
## 4 -0.402
## 5 -0.366
## 6 -0.116
## 7 3.665
## 8 1.585
## 9 -0.507
## 10 -0.139
## 11 1.556
## 12 0.756
## 13 0.222
## 14 -0.859
## 15 -0.917
## 16 -0.630
## 17 -0.783
## 18 1.361
## 19 0.117
## 20 -0.687
## 21 -0.110
## 22 -0.978
## 23 1.342
## 24 -0.534
## 25 -0.502
## 26 -0.277
## 27 0.335
## 28 0.011
## 29 -0.255
## 30 -0.994
## 31 -0.824
## 32 1.244
## 33 1.152
## 34 -0.383
## 35 2.431
## 36 -0.785
## 37 -0.544
## 38 -0.310
## 39 -0.672
## 40 0.550
## 41 -0.827
## 42 -0.629
## 43 0.177
## 44 0.233
## 45 -1.341
## 46 -0.957
## 47 -0.828
## 48 0.910
## 49 0.396
## 50 -0.263
## 51 0.098
## 52 -0.250
## 53 0.625
## 54 -1.359
## 55 -0.177
## 56 -1.111
## 57 -0.038
## 58 2.470
## 59 -0.623
## 60 -0.006
## 61 -0.613
## 62 0.073
## 63 -0.199
## 64 -0.330
## 65 -0.072
## 66 -1.230
## 67 -0.688
## 68 -0.391
## 69 -0.243
## 70 2.308
## 71 -0.231
## 72 1.919
## 73 0.173
## 74 -0.636
## 75 -0.071
## 76 -0.336
## 77 2.354
## 78 0.970
## 79 -0.708
## 80 -0.782
dados$large.residuals <- dados$Zscoreresiduos > 2.9 | dados$Zscoreresiduos < - 2.9
summary(dados$large.residuals) # 1 outlier: observação 5
## res
## Mode :logical
## FALSE:79
## TRUE :1
Normalidade para os resíduos (Shapiro-Wilk)
res <- rstandard(lm(dados$C_Death ~ IDH_2018 + GDP_Mean + GCI_2019 + ReD + Patent_Resident +
Patent_Non_Resident + Education, data = dados))
shapiro.test(res)
##
## Shapiro-Wilk normality test
##
## data: res
## W = 0.87877, p-value = 1.721e-06
hist(res, breaks = 15, freq = T, col = "lightgray", main = "Distribuição dos resíduos", xlab = "Escore")

qq <- qplot(sample = res)
ggplotly(qq) # resíduos não possuem distribuição normal
Teste de variância - homocedasticidade (Breush-Pagan)
library(ggplot2)
dados$fitted <- lm$fitted.values
dados$standr <- lm$residuals
dados[,c("C_Death", "fitted")]
## # A tibble: 80 x 2
## C_Death fitted
## <dbl> <dbl>
## 1 7.33 7.94
## 2 24.9 8.94
## 3 0.741 22.7
## 4 7.97 15.1
## 5 4.35 10.9
## 6 8.58 10.6
## 7 84.9 19.4
## 8 42.4 14.1
## 9 0.686 9.68
## 10 0 2.42
## # ... with 70 more rows
ggplot(data=dados, aes(fitted, standr)) + geom_point() + geom_smooth(color = "blue") + labs(x= "Valores ajustados", y = "Resíduos padronizados")

ggplot(data=dados, aes(fitted,standr)) + geom_point() + geom_smooth(method = "lm", color = "blue") + labs(x= "Valores ajustados", y = "Resíduos padronizados")

library(car)
ncvTest(lm(dados$C_Death ~ IDH_2018 + GDP_Mean + GCI_2019 + ReD + Patent_Resident +
Patent_Non_Resident + Education, data = dados)) # Dados possuem uma distribuição heterocedástica
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 13.58572, Df = 1, p = 0.00022791