Correlação
Importando os dados
# Import the SPSS data set.
# `foreign` does not integrate with the tidyverse on its own (it reads SPSS
# variables as factors), so the imported object is converted to a tibble below.
library(foreign)
library(tibble)
dados <- read.spss("C:/Users/user/Desktop/Vida acadêmica/Disciplinas/Estatística Vitor/Aula 3/Dados/Regressao.sav")
# attach() was removed on purpose: every statement visible in this script
# reaches the columns through `dados$...` or a `data =` argument, and attaching
# the pre-tibble object only left a stale, never-detached copy on the search path.
dados <- as_tibble(dados)
Teste de normalidade das variáveis
# Shapiro-Wilk normality test on each of the three Gastos_* variables.
# The `##` lines are the recorded console output of the call above them.
shapiro.test(dados$Gastos_saude)
##
## Shapiro-Wilk normality test
##
## data: dados$Gastos_saude
## W = 0.99104, p-value = 0.6297
shapiro.test(dados$Gastos_lazer)
##
## Shapiro-Wilk normality test
##
## data: dados$Gastos_lazer
## W = 0.98349, p-value = 0.1499
shapiro.test(dados$Gastos_educacao)
##
## Shapiro-Wilk normality test
##
## data: dados$Gastos_educacao
## W = 0.98292, p-value = 0.1328
Correlação Simples
# Pairwise correlation tests between the three Gastos_* variables.
# No method is passed, and the recorded output shows Pearson's correlation.
cor.test(dados$Gastos_educacao, dados$Gastos_saude)
##
## Pearson's product-moment correlation
##
## data: dados$Gastos_educacao and dados$Gastos_saude
## t = 13.463, df = 118, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6961127 0.8402773
## sample estimates:
## cor
## 0.77825
cor.test(dados$Gastos_educacao, dados$Gastos_lazer)
##
## Pearson's product-moment correlation
##
## data: dados$Gastos_educacao and dados$Gastos_lazer
## t = 12.228, df = 118, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6563268 0.8173331
## sample estimates:
## cor
## 0.7476177
cor.test(dados$Gastos_saude, dados$Gastos_lazer)
##
## Pearson's product-moment correlation
##
## data: dados$Gastos_saude and dados$Gastos_lazer
## t = 13.634, df = 118, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7011641 0.8431539
## sample estimates:
## cor
## 0.7821115
# Correlation matrix of columns 7 to 9; the output shows these are the three
# Gastos_* variables. NOTE(review): selection is positional, so it depends on
# the column order of the imported file.
cor(dados[,7:9])
## Gastos_saude Gastos_lazer Gastos_educacao
## Gastos_saude 1.0000000 0.7821115 0.7782500
## Gastos_lazer 0.7821115 1.0000000 0.7476177
## Gastos_educacao 0.7782500 0.7476177 1.0000000
Como determinar o método de correlação
# The same pair of variables tested with each available correlation method.
# Method names are spelled out in full ("kendall" used to be written "kendal",
# which only worked through partial argument matching) and TRUE/FALSE replace
# the reassignable T/F shortcuts.
cor.test(dados$Gastos_educacao, dados$Gastos_saude) # Pearson
##
## Pearson's product-moment correlation
##
## data: dados$Gastos_educacao and dados$Gastos_saude
## t = 13.463, df = 118, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6961127 0.8402773
## sample estimates:
## cor
## 0.77825
cor.test(dados$Gastos_educacao, dados$Gastos_saude, exact = FALSE, method = "spearman")
##
## Spearman's rank correlation rho
##
## data: dados$Gastos_educacao and dados$Gastos_saude
## S = 78681, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.7267847
cor.test(dados$Gastos_educacao, dados$Gastos_saude, exact = FALSE, method = "kendall")
##
## Kendall's rank correlation tau
##
## data: dados$Gastos_educacao and dados$Gastos_saude
## z = 8.8553, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.5469319
Modelos de visualização das correlações
# Base-graphics scatterplot matrix of columns 7 to 9 of `dados`.
pairs(dados[,7:9])

Pelo pacote corrplot
library(corrplot)
# Correlation matrix of columns 7 to 9, reused by every plot below.
Cor <- cor(dados[, 7:9])

# Default glyphs, with the text-label size given explicitly.
corrplot(Cor, tl.cex = 1)

# One plot per glyph style, in the same order as before.
invisible(lapply(
  c("color", "ellipse", "shade", "number"),
  function(glyph) corrplot(Cor, method = glyph)
))

# Triangular layouts: upper half with default glyphs, lower half as numbers.
corrplot(Cor, type = "upper")

corrplot(Cor, method = "number", type = "lower")

Pelo Hmisc
library(Hmisc)
# rcorr() is given columns 7 to 9 as a matrix; printing `m` shows the
# correlations, the sample size and the P matrix (see the output below).
m <- rcorr(as.matrix(dados[,7:9]))
m
## Gastos_saude Gastos_lazer Gastos_educacao
## Gastos_saude 1.00 0.78 0.78
## Gastos_lazer 0.78 1.00 0.75
## Gastos_educacao 0.78 0.75 1.00
##
## n= 120
##
##
## P
## Gastos_saude Gastos_lazer Gastos_educacao
## Gastos_saude 0 0
## Gastos_lazer 0 0
## Gastos_educacao 0 0
# Individual components of the rcorr() result.
m$r # matrix of correlation coefficients
## Gastos_saude Gastos_lazer Gastos_educacao
## Gastos_saude 1.0000000 0.7821115 0.7782500
## Gastos_lazer 0.7821115 1.0000000 0.7476177
## Gastos_educacao 0.7782500 0.7476177 1.0000000
m$P # matrix of p-values
## Gastos_saude Gastos_lazer Gastos_educacao
## Gastos_saude NA 0 0
## Gastos_lazer 0 NA 0
## Gastos_educacao 0 0 NA
m$n # matrix of n
## Gastos_saude Gastos_lazer Gastos_educacao
## Gastos_saude 120 120 120
## Gastos_lazer 120 120 120
## Gastos_educacao 120 120 120
# Both plots pass the p-value matrix with a 0.05 significance level.
# NOTE(review): the original comments say non-significant correlations are
# "eliminated"; confirm how corrplot renders them with its default `insig` setting.
corrplot(m$r, p.mat=m$P, sig.level = 0.05) # Circle plot, non-significant correlations singled out

corrplot(m$r, p.mat=m$P, sig.level = 0.05, method = "number", type = "upper") # Number plot, non-significant correlations singled out

Pelo GGally
library(ggplot2)
library(GGally)
# Correlation plot of `dados` with the coefficients printed to 2 digits.
# TRUE is spelled out because T is an ordinary, reassignable variable.
ggcorr(dados, digits = 2, label = TRUE)

# Pairs plot of columns 7 to 9 with smoothed scatterplots in the lower panels.
ggpairs(dados[, 7:9], lower = list(continuous = "smooth"))

# Subset and reorder columns by position, then colour the pairs plot by
# Profissao (which must therefore be among the selected columns).
# NOTE(review): positional indices depend on the column order of the imported
# file; selecting by name would be safer once the intended columns are confirmed.
dados2 <- dados[, c(3, 2, 6, 8, 9, 10, 11)]
ggpairs(dados2, columns = 2:7, aes(colour = Profissao))

Pelo corrgram
library(corrgram)
# Correlogram of `dados` with the package defaults.
corrgram(dados)

# Same data with a custom panel function for each region of the grid.
corrgram(
  dados,
  lower.panel = panel.pts,
  upper.panel = panel.conf,
  diag.panel = panel.density
)

Pelo car
library(car)
# Scatterplot matrix of the three Gastos_* variables, all cases pooled.
# FALSE/TRUE are spelled out consistently (the calls used to mix F with TRUE),
# and the backticks were dropped because the names are already syntactic.
spm(
  ~ Gastos_saude + Gastos_lazer + Gastos_educacao,
  data = dados, by.groups = FALSE, legend = FALSE
)

# Same matrix conditioned on Profissao, handled by groups.
spm(
  ~ Gastos_saude + Gastos_lazer + Gastos_educacao | Profissao,
  data = dados, by.groups = TRUE, legend = FALSE
)

Regressão Linear
Importando os dados
# Import the SPSS data set for the regression section.
# `foreign` does not integrate with the tidyverse on its own (it reads SPSS
# variables as factors), so the imported object is converted to a tibble below.
library(foreign)
library(tibble)
dados <- read.spss("C:/Users/user/Desktop/Vida acadêmica/Disciplinas/Estatística Vitor/Aula 3/Dados/Regressao.sav")
# attach() was removed on purpose: every statement visible in this script
# reaches the columns through `dados$...` or a `data =` argument, and attaching
# the pre-tibble object only left a stale, never-detached copy on the search path.
dados <- as_tibble(dados)
# Keep only the variables of interest for the model (columns 7 to 9).
dados2 <- dados[, 7:9]
Criando o modelo de regressão
# Linear model of Gastos_educacao on every other column of dados2 (`~ .`).
reg <- lm(Gastos_educacao ~., data = dados2)
summary(reg) # Model summary
##
## Call:
## lm(formula = Gastos_educacao ~ ., data = dados2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -115.661 -27.284 -3.401 21.809 140.070
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.77161 12.67814 0.534 0.594
## Gastos_saude 0.48772 0.08523 5.723 8.22e-08 ***
## Gastos_lazer 0.31038 0.07555 4.108 7.42e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.42 on 117 degrees of freedom
## Multiple R-squared: 0.6554, Adjusted R-squared: 0.6495
## F-statistic: 111.3 on 2 and 117 DF, p-value: < 2.2e-16
Avaliação do modelo
# Inspect the fitted model: standardized and raw coefficients, confidence
# intervals, the components stored in the lm object, and the residuals.
library(QuantPsyc)
lm.beta(reg) # standardized values
## Gastos_saude Gastos_lazer
## 0.4984003 0.3578131
reg$coefficients # Coefficients
## (Intercept) Gastos_saude Gastos_lazer
## 6.7716131 0.4877232 0.3103845
confint(reg) # confidence intervals of the coefficients
## 2.5 % 97.5 %
## (Intercept) -18.3367719 31.8799982
## Gastos_saude 0.3189333 0.6565131
## Gastos_lazer 0.1607626 0.4600065
attributes(reg)
## $names
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
##
## $class
## [1] "lm"
reg$residuals # Residuals
## 1 2 3 4 5 6
## 8.3634527 -2.5703931 11.7618970 -16.7265027 -12.8437423 -54.0174274
## 7 8 9 10 11 12
## -7.5372200 29.8925726 -26.6492687 7.4220346 -24.0804711 -7.0976214
## 13 14 15 16 17 18
## 2.5705041 -16.9886448 12.6911752 33.5125748 -44.4135455 -9.3066095
## 19 20 21 22 23 24
## -13.3562795 10.8909361 13.3002035 61.5254364 58.2245308 -2.4799487
## 25 26 27 28 29 30
## -9.2399304 27.3418608 43.3285293 73.7075034 -18.8691957 -4.0203202
## 31 32 33 34 35 36
## -54.8592480 9.4448288 -31.1430678 -1.0130422 9.9823699 -57.1175370
## 37 38 39 40 41 42
## 13.8614342 -70.3352527 3.3935962 -16.3435324 -12.9843367 11.7386997
## 43 44 45 46 47 48
## -7.2231398 -29.1899776 -33.0587438 -2.7815943 22.0636254 11.5579561
## 49 50 51 52 53 54
## 54.6050464 -4.8696808 69.2931615 140.0702050 23.5576743 -69.6930647
## 55 56 57 58 59 60
## -51.3396623 -0.6097038 54.3110781 -23.6953159 -8.9216996 -56.3831082
## 61 62 63 64 65 66
## 0.9936549 80.9650165 -24.5465472 -18.7598765 24.8446653 38.8933283
## 67 68 69 70 71 72
## 138.6587158 -18.7280427 13.2172556 3.8136439 39.1603427 -25.8308848
## 73 74 75 76 77 78
## 75.8071875 -23.7410423 -21.5347860 -5.1079370 -34.3178134 -15.7308293
## 79 80 81 82 83 84
## -57.6546518 61.6634769 57.0592270 -2.7568010 44.3644819 -30.2491707
## 85 86 87 88 89 90
## 2.5128170 21.7241753 -33.7349861 69.2257583 -46.5802218 11.5779443
## 91 92 93 94 95 96
## -57.3143296 -15.3792346 71.7846987 -32.8310974 -52.2430165 -31.3418051
## 97 98 99 100 101 102
## 35.9005007 131.1325011 -16.2699197 -33.7607054 14.4566575 8.5567068
## 103 104 105 106 107 108
## 17.4956001 11.8462054 -6.6427302 -98.9584315 32.0056719 -30.5492327
## 109 110 111 112 113 114
## 13.9947866 -50.2903500 -115.6611205 -71.5015894 30.7914873 -45.0076109
## 115 116 117 118 119 120
## 62.3673767 54.0897106 -59.6192608 -42.0659612 -53.8936725 -12.9539942
# 2 x 2 layout so the diagnostic plots drawn by plot(reg) share one page.
# NOTE(review): mar = c(0,0,0,0) sets all four margins to zero, which leaves no
# room for axis labels or titles; confirm this is intended.
par(mfrow = c(2,2), mar = c(0,0,0,0))
plot(reg) # Linearity, normality, homoscedasticity, influential cases

# Closes the current graphics device instead of restoring the previous par().
dev.off()
## null device
## 1
# Analysis-of-variance table for the fitted model.
anova(reg)
## Analysis of Variance Table
##
## Response: Gastos_educacao
## Df Sum Sq Mean Sq F value Pr(>F)
## Gastos_saude 1 405737 405737 205.633 < 2.2e-16 ***
## Gastos_lazer 1 33303 33303 16.879 7.421e-05 ***
## Residuals 117 230854 1973
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Distância de Cook
# Diagnostic plot number 4 of plot(reg), drawn with red diamond markers.
plot(reg, pch = 18, col = "red", which=c(4))

# Cook's distance of every observation (full listing below).
cooks.distance(reg)
## 1 2 3 4 5 6
## 3.626821e-04 3.567185e-05 2.979681e-04 1.843818e-03 7.679278e-04 4.580775e-03
## 7 8 9 10 11 12
## 6.495522e-04 6.428010e-03 4.970680e-03 1.395884e-04 3.494716e-03 7.032325e-04
## 13 14 15 16 17 18
## 2.082471e-05 2.793338e-03 5.198148e-04 7.453809e-03 7.441787e-03 1.545638e-04
## 19 20 21 22 23 24
## 3.971420e-04 6.505589e-04 6.929172e-04 1.077426e-02 7.853514e-03 3.912614e-05
## 25 26 27 28 29 30
## 2.788476e-04 1.845123e-03 1.306472e-02 1.099294e-02 7.926165e-04 1.444299e-04
## 31 32 33 34 35 36
## 5.375211e-03 3.542030e-04 6.180320e-03 1.505719e-06 3.018508e-04 1.678888e-02
## 37 38 39 40 41 42
## 8.068665e-04 1.424050e-02 3.234163e-05 3.922071e-04 5.446103e-04 7.242952e-04
## 43 44 45 46 47 48
## 9.592285e-05 2.745744e-03 1.157433e-02 1.740064e-05 1.378161e-03 2.126748e-04
## 49 50 51 52 53 54
## 8.958884e-03 1.349452e-04 1.203245e-02 3.829091e-02 1.612302e-03 1.633712e-02
## 55 56 57 58 59 60
## 3.914979e-03 1.213124e-06 1.770253e-02 1.567729e-03 3.882119e-04 7.803860e-03
## 61 62 63 64 65 66
## 3.150686e-06 9.624244e-03 1.922356e-03 6.169198e-04 2.544011e-03 4.343879e-03
## 67 68 69 70 71 72
## 1.580055e-01 1.682506e-03 5.662678e-03 5.787266e-05 6.681944e-03 5.921682e-03
## 73 74 75 76 77 78
## 1.319510e-02 3.191760e-03 1.577229e-03 2.239681e-04 2.255262e-03 6.573966e-04
## 79 80 81 82 83 84
## 5.113247e-03 1.382224e-02 5.882502e-03 8.606703e-05 1.445028e-02 2.942380e-03
## 85 86 87 88 89 90
## 1.107930e-05 8.876767e-04 9.936581e-03 2.067455e-02 4.953311e-03 8.997315e-04
## 91 92 93 94 95 96
## 1.920048e-02 1.826554e-03 6.358865e-02 5.523618e-03 2.076753e-02 3.090727e-03
## 97 98 99 100 101 102
## 2.000111e-03 5.747722e-02 8.525485e-04 3.265669e-03 3.755280e-04 1.374947e-04
## 103 104 105 106 107 108
## 1.424988e-03 6.143592e-04 1.787136e-04 2.486814e-02 4.126715e-03 3.698866e-03
## 109 110 111 112 113 114
## 1.559297e-03 8.568435e-03 5.785654e-02 1.865775e-02 4.549437e-03 4.514693e-03
## 115 116 117 118 119 120
## 1.023297e-02 5.271212e-03 1.600172e-02 1.415505e-02 9.505620e-03 1.266218e-03
# Cook's distances rounded to 5 decimals and listed in increasing order.
meuCD <- cooks.distance(reg)
mecdr <- round(meuCD, digits = 5)
sort(mecdr, decreasing = FALSE)
## 34 56 61 85 13 46 39 2 24 70
## 0.00000 0.00000 0.00000 0.00001 0.00002 0.00002 0.00003 0.00004 0.00004 0.00006
## 82 43 50 10 30 102 18 105 48 76
## 0.00009 0.00010 0.00013 0.00014 0.00014 0.00014 0.00015 0.00018 0.00021 0.00022
## 25 3 35 32 1 101 40 59 19 15
## 0.00028 0.00030 0.00030 0.00035 0.00036 0.00038 0.00039 0.00039 0.00040 0.00052
## 41 104 64 7 20 78 21 12 42 5
## 0.00054 0.00061 0.00062 0.00065 0.00065 0.00066 0.00069 0.00070 0.00072 0.00077
## 29 37 99 86 90 120 47 103 109 58
## 0.00079 0.00081 0.00085 0.00089 0.00090 0.00127 0.00138 0.00142 0.00156 0.00157
## 75 53 68 92 4 26 63 97 77 65
## 0.00158 0.00161 0.00168 0.00183 0.00184 0.00185 0.00192 0.00200 0.00226 0.00254
## 44 14 84 96 74 100 11 108 55 107
## 0.00275 0.00279 0.00294 0.00309 0.00319 0.00327 0.00349 0.00370 0.00391 0.00413
## 66 114 113 6 89 9 79 116 31 94
## 0.00434 0.00451 0.00455 0.00458 0.00495 0.00497 0.00511 0.00527 0.00538 0.00552
## 69 81 72 33 8 71 17 16 60 23
## 0.00566 0.00588 0.00592 0.00618 0.00643 0.00668 0.00744 0.00745 0.00780 0.00785
## 110 49 119 62 87 115 22 28 45 51
## 0.00857 0.00896 0.00951 0.00962 0.00994 0.01023 0.01077 0.01099 0.01157 0.01203
## 27 73 80 118 38 83 117 54 36 57
## 0.01306 0.01320 0.01382 0.01416 0.01424 0.01445 0.01600 0.01634 0.01679 0.01770
## 112 91 88 95 106 52 98 111 93 67
## 0.01866 0.01920 0.02067 0.02077 0.02487 0.03829 0.05748 0.05786 0.06359 0.15801
# Keep each observation's Cook's distance next to its data row.
dados$cooks.distance <- cooks.distance(reg)
library(plotly)
library(magrittr)
# Interactive scatter of Cook's distance by observation.
# x is supplied explicitly as the 1-based row number: without it plotly numbers
# the points from 0, so the hover index was one less than the row in `dados`.
# mode = "markers" names the mode plotly was already falling back to, and the
# formula `~cooks.distance` reads the column from `data` instead of `dados$`.
plot_ly(
  data = dados,
  x = seq_len(nrow(dados)),
  y = ~cooks.distance,
  type = "scatter",
  mode = "markers"
) %>%
  layout(title = "Distância de Cook")
Calcular a correlação entre as VIs (multicolinearidade)
# Correlation matrix of the model variables, rounded to 2 decimals.
round(cor(dados2),2)
## Gastos_saude Gastos_lazer Gastos_educacao
## Gastos_saude 1.00 0.78 0.78
## Gastos_lazer 0.78 1.00 0.75
## Gastos_educacao 0.78 0.75 1.00
library(car)
vif(reg) # Cut-off: VIF up to 10
## Gastos_saude Gastos_lazer
## 2.575318 2.575318
# Average VIF across the predictors.
mean(vif(reg))
## [1] 2.575318
1/vif(reg) # Tolerance (reciprocal of VIF). Cut-off: values below 0.2
## Gastos_saude Gastos_lazer
## 0.3883015 0.3883015
Durbin-Watson (autocorrelação dos resíduos)
# Durbin-Watson test run with two packages. The recorded outputs show that
# lmtest::dwtest used a one-sided alternative (autocorrelation > 0) and
# car::durbinWatsonTest a two-sided one (rho != 0), hence the different p-values.
library(lmtest)
dwtest(reg) # Values close to 2
##
## Durbin-Watson test
##
## data: reg
## DW = 1.8241, p-value = 0.1552
## alternative hypothesis: true autocorrelation is greater than 0
library(car)
durbinWatsonTest(reg) # H0: residuals are random
## lag Autocorrelation D-W Statistic p-value
## 1 0.08742109 1.824128 0.296
## Alternative hypothesis: rho != 0
Resíduos
# Standardized residuals of the fitted model, kept as a plain numeric vector.
# The previous version refitted the same lm() that is already stored in `reg`
# and wrapped the result in a data frame; assigning that to a column of `dados`
# nested a data frame inside the tibble and turned the later outlier flag into
# a one-column matrix.
res <- unname(rstandard(reg))
Outliers dos resíduos
# Store the standardized residuals next to the data and print them.
dados$Zscoreresiduos <- res
dados$Zscoreresiduos
## res
## 1 0.19106773
## 2 -0.05875611
## 3 0.26645191
## 4 -0.38356939
## 5 -0.29299923
## 6 -1.22165535
## 7 -0.17499727
## 8 0.68658446
## 9 -0.61177766
## 10 0.16831918
## 11 -0.55138069
## 12 -0.16580330
## 13 0.05839625
## 14 -0.39271095
## 15 0.28837696
## 16 0.76859963
## 17 -1.01072808
## 18 -0.21060769
## 19 -0.30263319
## 20 0.24901132
## 21 0.30279665
## 22 1.39652446
## 23 1.31961946
## 24 -0.05683525
## 25 -0.20997836
## 26 0.61995141
## 27 0.99457236
## 28 1.66913520
## 29 -0.42754746
## 30 -0.09275853
## 31 -1.24146502
## 32 0.21505584
## 33 -0.71375376
## 34 -0.02290413
## 35 0.22669973
## 36 -1.30474411
## 37 0.31581995
## 38 -1.59663785
## 39 0.07702071
## 40 -0.36951630
## 41 -0.29504083
## 42 0.26822887
## 43 -0.16348428
## 44 -0.66326426
## 45 -0.76594406
## 46 -0.06303081
## 47 0.50078587
## 48 0.26141080
## 49 1.23999559
## 50 -0.11140259
## 51 1.57132682
## 52 3.17129358
## 53 0.53480859
## 54 -1.58421356
## 55 -1.16081224
## 56 -0.01385548
## 57 1.24349956
## 58 -0.53776202
## 59 -0.20365066
## 60 -1.27838639
## 61 0.02257616
## 62 1.83056377
## 63 -0.55770465
## 64 -0.42449575
## 65 0.56594111
## 66 0.88287651
## 67 3.19328916
## 68 -0.42740106
## 69 0.32112879
## 70 0.08683754
## 71 0.89261986
## 72 -0.59588817
## 73 1.71801828
## 74 -0.54307788
## 75 -0.48956485
## 76 -0.11774613
## 77 -0.77689919
## 78 -0.35687238
## 79 -1.30379579
## 80 1.40275232
## 81 1.29132739
## 82 -0.06398945
## 83 1.01937752
## 84 -0.68731845
## 85 0.05685993
## 86 0.49175188
## 87 -0.77793966
## 88 1.57774353
## 89 -1.05560799
## 90 0.26558911
## 91 -1.31171132
## 92 -0.35372578
## 93 1.67038748
## 94 -0.74992143
## 95 -1.20124481
## 96 -0.71200693
## 97 0.81188170
## 98 2.98063647
## 99 -0.36968858
## 100 -0.76635229
## 101 0.32716433
## 102 0.19368930
## 103 0.39912042
## 104 0.27003744
## 105 -0.15128613
## 106 -2.24424441
## 107 0.72887595
## 108 -0.69558263
## 109 0.32208338
## 110 -1.14324258
## 111 -2.63614247
## 112 -1.62661945
## 113 0.70270898
## 114 -1.01981190
## 115 1.41477417
## 116 1.22410529
## 117 -1.35949972
## 118 -0.96822305
## 119 -1.22476179
## 120 -0.29780718
# Same residuals rounded to 3 decimals for easier reading.
round(dados$Zscoreresiduos, digits = 3)
## res
## 1 0.191
## 2 -0.059
## 3 0.266
## 4 -0.384
## 5 -0.293
## 6 -1.222
## 7 -0.175
## 8 0.687
## 9 -0.612
## 10 0.168
## 11 -0.551
## 12 -0.166
## 13 0.058
## 14 -0.393
## 15 0.288
## 16 0.769
## 17 -1.011
## 18 -0.211
## 19 -0.303
## 20 0.249
## 21 0.303
## 22 1.397
## 23 1.320
## 24 -0.057
## 25 -0.210
## 26 0.620
## 27 0.995
## 28 1.669
## 29 -0.428
## 30 -0.093
## 31 -1.241
## 32 0.215
## 33 -0.714
## 34 -0.023
## 35 0.227
## 36 -1.305
## 37 0.316
## 38 -1.597
## 39 0.077
## 40 -0.370
## 41 -0.295
## 42 0.268
## 43 -0.163
## 44 -0.663
## 45 -0.766
## 46 -0.063
## 47 0.501
## 48 0.261
## 49 1.240
## 50 -0.111
## 51 1.571
## 52 3.171
## 53 0.535
## 54 -1.584
## 55 -1.161
## 56 -0.014
## 57 1.243
## 58 -0.538
## 59 -0.204
## 60 -1.278
## 61 0.023
## 62 1.831
## 63 -0.558
## 64 -0.424
## 65 0.566
## 66 0.883
## 67 3.193
## 68 -0.427
## 69 0.321
## 70 0.087
## 71 0.893
## 72 -0.596
## 73 1.718
## 74 -0.543
## 75 -0.490
## 76 -0.118
## 77 -0.777
## 78 -0.357
## 79 -1.304
## 80 1.403
## 81 1.291
## 82 -0.064
## 83 1.019
## 84 -0.687
## 85 0.057
## 86 0.492
## 87 -0.778
## 88 1.578
## 89 -1.056
## 90 0.266
## 91 -1.312
## 92 -0.354
## 93 1.670
## 94 -0.750
## 95 -1.201
## 96 -0.712
## 97 0.812
## 98 2.981
## 99 -0.370
## 100 -0.766
## 101 0.327
## 102 0.194
## 103 0.399
## 104 0.270
## 105 -0.151
## 106 -2.244
## 107 0.729
## 108 -0.696
## 109 0.322
## 110 -1.143
## 111 -2.636
## 112 -1.627
## 113 0.703
## 114 -1.020
## 115 1.415
## 116 1.224
## 117 -1.359
## 118 -0.968
## 119 -1.225
## 120 -0.298
# Flag observations whose standardized residual lies outside +/- 2.9,
# then count how many were flagged.
dados$large.residuals <- abs(dados$Zscoreresiduos) > 2.9
summary(dados$large.residuals)
## res
## Mode :logical
## FALSE:117
## TRUE :3
Normalidade para os resíduos (Shapiro-Wilk)
# Shapiro-Wilk test on the standardized residuals.
# The residuals come from the model already stored in `reg`; refitting the
# same lm() here was redundant work.
res <- rstandard(reg)
shapiro.test(res)
##
## Shapiro-Wilk normality test
##
## data: res
## W = 0.96922, p-value = 0.007498
# Histogram of the standardized residuals, shown as counts.
# TRUE is spelled out because T is an ordinary, reassignable variable.
hist(res, breaks = 15, freq = TRUE, col = "lightgray", main = "Distribuição dos resíduos", xlab = "Escore")

# Q-Q plot of the residuals, made interactive with plotly.
# qplot() is deprecated in current ggplot2 releases, so the same sample
# aesthetic and Q-Q geom are now built with ggplot() directly.
qq <- ggplot(data.frame(res = res), aes(sample = res)) +
  geom_qq()
ggplotly(qq)
Teste de variância - homocedasticidade (Breusch-Pagan)
library(ggplot2)
# Fitted values and standardized residuals stored with the data for plotting.
# `standr` used to hold the raw residuals (reg$residuals), although the plots
# built from it label that axis "Resíduos padronizados" (standardized
# residuals); rstandard() makes the data match the label.
dados$fitted <- reg$fitted.values
dados$standr <- rstandard(reg)
# Observed and fitted Gastos_educacao side by side.
dados[, c("Gastos_educacao", "fitted")]
## # A tibble: 120 x 2
## Gastos_educacao fitted
## <dbl> <dbl>
## 1 97.5 89.1
## 2 186. 189.
## 3 154. 142.
## 4 57.1 73.8
## 5 81.8 94.6
## 6 126. 180.
## 7 134. 142.
## 8 103. 73.5
## 9 43.2 69.8
## 10 159. 151.
## # ... with 110 more rows
# Residuals against fitted values with geom_smooth()'s default smoother.
ggplot(data = dados, mapping = aes(x = fitted, y = standr)) +
  geom_point() +
  geom_smooth(color = "blue") +
  labs(x = "Valores ajustados", y = "Resíduos padronizados")

# Same scatter with a straight-line ("lm") smoother.
ggplot(data = dados, mapping = aes(x = fitted, y = standr)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  labs(x = "Valores ajustados", y = "Resíduos padronizados")

library(car)
# Non-constant variance score test (see the output header), run on a fresh fit
# of the same formula and data used for `reg`.
ncvTest(lm(Gastos_educacao ~., data = dados2)) #p = 0.19005
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 1.717224, Df = 1, p = 0.19005
Regressão sem outliers
# Drop the observations flagged as large residuals and refit the model.
# The flag is logical, so it is negated directly instead of being compared
# with the string "FALSE" (which only worked through implicit coercion).
dados <- subset(dados, !large.residuals)
reg2 <- lm(Gastos_educacao ~ Gastos_lazer + Gastos_saude, data = dados)
summary(reg2)
##
## Call:
## lm(formula = Gastos_educacao ~ Gastos_lazer + Gastos_saude, data = dados)
##
## Residuals:
## Min 1Q Median 3Q Max
## -113.271 -25.631 -2.201 18.593 84.620
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.87481 11.19846 0.167 0.867
## Gastos_lazer 0.32816 0.06701 4.897 3.23e-06 ***
## Gastos_saude 0.47246 0.07661 6.167 1.09e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38.99 on 114 degrees of freedom
## Multiple R-squared: 0.7158, Adjusted R-squared: 0.7108
## F-statistic: 143.6 on 2 and 114 DF, p-value: < 2.2e-16
Regressão sem o intercepto
# Model with intercept, shown for comparison with the no-intercept fit that follows.
# NOTE(review): this repeats the reg2 fit from the "sem outliers" section with
# the same formula and data.
reg2 <- lm(Gastos_educacao ~ Gastos_lazer + Gastos_saude, data = dados)
summary(reg2) # The intercept is not significant in the model
##
## Call:
## lm(formula = Gastos_educacao ~ Gastos_lazer + Gastos_saude, data = dados)
##
## Residuals:
## Min 1Q Median 3Q Max
## -113.271 -25.631 -2.201 18.593 84.620
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.87481 11.19846 0.167 0.867
## Gastos_lazer 0.32816 0.06701 4.897 3.23e-06 ***
## Gastos_saude 0.47246 0.07661 6.167 1.09e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38.99 on 114 degrees of freedom
## Multiple R-squared: 0.7158, Adjusted R-squared: 0.7108
## F-statistic: 143.6 on 2 and 114 DF, p-value: < 2.2e-16
# `-1` in the formula removes the intercept term.
# NOTE(review): per the summary.lm documentation, R-squared is computed about
# zero when the model has no intercept, so the value reported here is not
# directly comparable with the R-squared of the intercept model.
reg4 <- lm(Gastos_educacao ~ -1 + Gastos_lazer + Gastos_saude, data = dados)
summary(reg4)
##
## Call:
## lm(formula = Gastos_educacao ~ -1 + Gastos_lazer + Gastos_saude,
## data = dados)
##
## Residuals:
## Min 1Q Median 3Q Max
## -113.761 -24.874 -1.214 18.909 84.879
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Gastos_lazer 0.33288 0.06054 5.498 2.35e-07 ***
## Gastos_saude 0.47494 0.07485 6.346 4.54e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38.82 on 115 degrees of freedom
## Multiple R-squared: 0.9602, Adjusted R-squared: 0.9596
## F-statistic: 1389 on 2 and 115 DF, p-value: < 2.2e-16
Regressão Polinomial
# Second-degree polynomial terms for both predictors via poly().
# poly() is called without `raw`, so it uses its default basis (orthogonal
# polynomials per the stats::poly documentation); coefficients are therefore
# not on the scale of the raw variables.
reg5 <- lm(Gastos_educacao ~ poly(Gastos_lazer,2) + poly(Gastos_saude,2), data = dados)
summary(reg5)
##
## Call:
## lm(formula = Gastos_educacao ~ poly(Gastos_lazer, 2) + poly(Gastos_saude,
## 2), data = dados)
##
## Residuals:
## Min 1Q Median 3Q Max
## -111.116 -26.763 -2.869 20.454 86.201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 179.037 3.626 49.381 < 2e-16 ***
## poly(Gastos_lazer, 2)1 309.224 63.612 4.861 3.83e-06 ***
## poly(Gastos_lazer, 2)2 -36.191 60.138 -0.602 0.549
## poly(Gastos_saude, 2)1 387.513 63.723 6.081 1.69e-08 ***
## poly(Gastos_saude, 2)2 48.797 60.020 0.813 0.418
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.22 on 112 degrees of freedom
## Multiple R-squared: 0.7175, Adjusted R-squared: 0.7074
## F-statistic: 71.1 on 4 and 112 DF, p-value: < 2.2e-16
# Third-degree polynomial terms for both predictors via poly(), again with
# poly()'s default basis.
reg6 <- lm(Gastos_educacao ~ poly(Gastos_lazer,3) + poly(Gastos_saude,3), data = dados)
summary(reg6)
##
## Call:
## lm(formula = Gastos_educacao ~ poly(Gastos_lazer, 3) + poly(Gastos_saude,
## 3), data = dados)
##
## Residuals:
## Min 1Q Median 3Q Max
## -104.489 -24.529 -1.938 21.496 84.159
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 179.037 3.613 49.556 < 2e-16 ***
## poly(Gastos_lazer, 3)1 271.136 69.090 3.924 0.000152 ***
## poly(Gastos_lazer, 3)2 -45.928 60.416 -0.760 0.448766
## poly(Gastos_lazer, 3)3 -18.563 53.547 -0.347 0.729508
## poly(Gastos_saude, 3)1 417.398 67.317 6.200 1.01e-08 ***
## poly(Gastos_saude, 3)2 55.299 60.558 0.913 0.363154
## poly(Gastos_saude, 3)3 80.197 55.605 1.442 0.152072
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.08 on 110 degrees of freedom
## Multiple R-squared: 0.7245, Adjusted R-squared: 0.7094
## F-statistic: 48.2 on 6 and 110 DF, p-value: < 2.2e-16