#library(tidyverse)
library(stringr)
library(readxl)
library(faraway)
library(mctest)
library(REdaS)
## Loading required package: grid
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:faraway':
##
## logit
tbl <- read_excel('original.xlsx')
tbl
## # A tibble: 44 x 39
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3 3 2 3 3 4 3 3 2 3 3 4 2
## 2 1 5 2 1 5 4 2 4 2 4 4 4 3
## 3 3 3 1 2 5 5 2 4 2 5 3 5 1
## 4 3 3 2 3 1 1 5 1 1 5 5 5 1
## 5 2 3 4 2 2 3 3 1 2 4 3 4 2
## 6 1 5 3 1 5 5 3 5 3 5 5 5 3
## 7 1 5 3 1 4 1 3 3 2 5 4 4 1
## 8 3 3 3 3 3 3 3 3 3 3 3 3 3
## 9 1 1 5 1 5 3 3 3 3 3 3 3 3
## 10 2 3 2 1 5 4 3 3 2 5 5 4 4
## # ... with 34 more rows, and 26 more variables: V14 <dbl>, V15 <dbl>,
## # V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>, V21 <dbl>,
## # V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>, V27 <dbl>,
## # V28 <dbl>, V29 <dbl>, V30 <dbl>, V31 <dbl>, V32 <dbl>, V33 <dbl>,
## # V34 <dbl>, V35 <dbl>, V36 <dbl>, V37 <dbl>, V38 <dbl>, V39 <dbl>
Exclude the dichotomous variables.
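No code for this step appears in the script; below is a minimal sketch, assuming a dichotomous item is one with exactly two distinct observed values (is_dichotomous and tbl_no_dich are hypothetical names, and the downstream code keeps using tbl).
# Hypothetical filter: flag items with exactly two distinct observed values
is_dichotomous <- sapply(tbl, function(v) length(unique(na.omit(v))) == 2)
tbl_no_dich <- tbl[, !is_dichotomous]   # sketch only; not used below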
Test for multicollinearity. 3.1 Check multicollinearity with vif (see the paper Multicolinearidade.pdf, p. 49) and mctest (see the paper Collinearity.pdf); all variables with VIF > 5 or 10 should be excluded.
vif(tbl)
## V1 V2 V3 V4 V5 V6 V7
## 90.784280 31.198154 12.300448 29.527466 22.556435 6.019989 6.590458
## V8 V9 V10 V11 V12 V13 V14
## 23.926717 16.669632 20.545358 12.565751 26.671694 14.261724 21.487539
## V15 V16 V17 V18 V19 V20 V21
## 11.734746 23.389864 18.430357 31.185097 10.765581 16.081202 35.230609
## V22 V23 V24 V25 V26 V27 V28
## 11.107738 79.193734 5.709470 8.998989 39.415580 49.532076 9.234147
## V29 V30 V31 V32 V33 V34 V35
## 12.830974 12.427203 32.354667 117.043875 121.444058 8.917779 25.345198
## V36 V37 V38 V39
## 27.465033 11.484875 7.398812 9.165682
tbl1 <- tbl[vif(tbl) > 10]   # keeps the columns whose VIF exceeds 10
tbl1
## # A tibble: 44 x 31
## V1 V2 V3 V4 V5 V8 V9 V10 V11 V12 V13 V14 V15
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3 3 2 3 3 3 2 3 3 4 2 4 4
## 2 1 5 2 1 5 4 2 4 4 4 3 3 4
## 3 3 3 1 2 5 4 2 5 3 5 1 2 5
## 4 3 3 2 3 1 1 1 5 5 5 1 1 5
## 5 2 3 4 2 2 1 2 4 3 4 2 3 5
## 6 1 5 3 1 5 5 3 5 5 5 3 4 5
## 7 1 5 3 1 4 3 2 5 4 4 1 3 5
## 8 3 3 3 3 3 3 3 3 3 3 3 3 3
## 9 1 1 5 1 5 3 3 3 3 3 3 3 3
## 10 2 3 2 1 5 3 2 5 5 4 4 4 4
## # ... with 34 more rows, and 18 more variables: V16 <dbl>, V17 <dbl>,
## # V18 <dbl>, V19 <dbl>, V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>,
## # V26 <dbl>, V27 <dbl>, V29 <dbl>, V30 <dbl>, V31 <dbl>, V32 <dbl>,
## # V33 <dbl>, V35 <dbl>, V36 <dbl>, V37 <dbl>
head(tbl1)
## # A tibble: 6 x 31
## V1 V2 V3 V4 V5 V8 V9 V10 V11 V12 V13 V14 V15
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3 3 2 3 3 3 2 3 3 4 2 4 4
## 2 1 5 2 1 5 4 2 4 4 4 3 3 4
## 3 3 3 1 2 5 4 2 5 3 5 1 2 5
## 4 3 3 2 3 1 1 1 5 5 5 1 1 5
## 5 2 3 4 2 2 1 2 4 3 4 2 3 5
## 6 1 5 3 1 5 5 3 5 5 5 3 4 5
## # ... with 18 more variables: V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>,
## # V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>, V26 <dbl>, V27 <dbl>,
## # V29 <dbl>, V30 <dbl>, V31 <dbl>, V32 <dbl>, V33 <dbl>, V35 <dbl>,
## # V36 <dbl>, V37 <dbl>
x <- tbl[, -1]   # predictors (reassigned below for the Hald example)
y <- tbl[, 1]    # response
Identify the variables with multicollinearity (see the paper Collinearity.pdf):
library(mctest)
head(Hald)
## y X1 X2 X3 X4
## [1,] 78.5 7 26 6 60
## [2,] 74.3 1 29 15 52
## [3,] 104.3 11 56 8 20
## [4,] 87.6 11 31 8 47
## [5,] 95.9 7 52 6 33
## [6,] 109.2 11 55 9 22
x <- Hald[, -1]
y <- Hald[, 1]
model <- lm(y ~ X1 + X2 + X3 + X4, data = as.data.frame(Hald))
# Collinearity Diagnostic measures in matrix of 0 or 1
imcdiag(model, all = TRUE)
##
## Call:
## imcdiag(mod = model, all = TRUE)
##
##
## All Individual Multicollinearity Diagnostics in 0 or 1
##
## VIF TOL Wi Fi Leamer CVIF Klein IND1 IND2
## X1 1 1 1 1 0 0 0 1 1
## X2 1 1 1 1 1 0 1 1 1
## X3 1 1 1 1 0 0 0 1 1
## X4 1 1 1 1 1 0 1 1 1
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
##
## X1 , X2 , X3 , X4 , coefficient(s) are non-significant may be due to multicollinearity
##
## R-square of y on all x: 0.9824
##
## * use method argument to check which regressors may be the reason of collinearity
## ===================================
imcdiag(model, method = "VIF", all = TRUE)
##
## Call:
## imcdiag(mod = model, method = "VIF", all = TRUE)
##
##
## VIF Multicollinearity Diagnostics
##
## VIF detection
## X1 38.4962 1
## X2 254.4232 1
## X3 46.8684 1
## X4 282.5129 1
##
## All Individual Multicollinearity Diagnostics in 0 or 1
##
## VIF TOL Wi Fi Leamer CVIF Klein IND1 IND2
## X1 1 1 1 1 0 0 0 1 1
## X2 1 1 1 1 1 0 1 1 1
## X3 1 1 1 1 0 0 0 1 1
## X4 1 1 1 1 1 0 1 1 1
##
## Multicollinearity may be due to X1 X2 X3 X4 regressors
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
##
## ===================================
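The Hald data above is only mctest's built-in worked example. As a sketch (not part of the original analysis), the same individual diagnostics could be run on the questionnaire items, taking V1 as an arbitrary stand-in response since no dependent variable is designated here.
# Sketch: mctest diagnostics on the survey items, with V1 as an arbitrary response
model_tbl <- lm(V1 ~ ., data = as.data.frame(tbl))
imcdiag(model_tbl, method = "VIF")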
y1 <- c(tbl)      # coerce the tibble to a list of 39 column vectors
y2 <- rnorm(y1)   # draws length(y1) = 39 standard-normal values (simulated, not the survey responses)
shapiro.test(y2)
##
## Shapiro-Wilk normality test
##
## data: y2
## W = 0.98162, p-value = 0.7618
Interpretation: W = 1.0 is the "perfect fit"; the W statistic is essentially the Pearson correlation applied to the curve drawn by qqnorm(). Under the null hypothesis of shapiro.test, a p-value <= 0.05 would reject the hypothesis that the sample comes from a normal distribution; here p = 0.7618, so normality is not rejected.
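A minimal sketch of that connection (not in the original analysis): the squared correlation between the sample values and the theoretical quantiles returned by qqnorm() tracks W closely.
# Squared correlation of the qqnorm() points approximates the Shapiro-Wilk W
qq <- qqnorm(y2, plot.it = FALSE)
cor(qq$x, qq$y)^2   # should be close to W = 0.98162 for this sample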
Ref: https://www.rdocumentation.org/packages/REdaS/versions/0.9.3/topics/Bartlett-Sphericity
The bartlett.test() call below is a test of homogeneity of variances: a p-value above the 0.05 significance level would mean we cannot reject the null hypothesis that the variance is the same across groups. This is not the same as Bartlett's Test of Sphericity (bart_spher(), further below), which tests whether the correlation matrix is an identity matrix. For Factor Analysis to be considered suitable, the p-value of Bartlett's Test of Sphericity must be less than 0.05.
bartlett.test(tbl)
##
## Bartlett test of homogeneity of variances
##
## data: tbl
## Bartlett's K-squared = 103.62, df = 38, p-value = 5.39e-08
datamatrix <- data.frame("A" = rnorm(100), "B" = rnorm(100), "C" = rnorm(100))
head(datamatrix)
## A B C
## 1 0.94134839 1.1163323 0.2865841
## 2 -0.90638169 1.6050925 0.4783473
## 3 -0.46250610 -0.9855091 1.1257085
## 4 -0.03222622 -0.1194222 -1.5910541
## 5 0.65481876 -0.5670538 1.0443353
## 6 0.42254332 1.1800630 0.7532627
# correlation matrix
cor(datamatrix)
## A B C
## A 1.00000000 0.1279772 -0.06272345
## B 0.12797718 1.0000000 0.18364892
## C -0.06272345 0.1836489 1.00000000
# bartlett's test
bart_spher(datamatrix)
## Bartlett's Test of Sphericity
##
## Call: bart_spher(x = datamatrix)
##
## X2 = 5.701
## df = 3
## p-value = .12708
corMat <- cor(tbl1)
bart_spher(corMat)
## Bartlett's Test of Sphericity
##
## Call: bart_spher(x = corMat)
##
## X2 = 2400.262
## df = 465
## p-value < 2.22e-16
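For transparency, here is a minimal sketch of the statistic that bart_spher() reports, computed by hand on the complete cases of tbl1 using the standard formula chi^2 = -(n - 1 - (2p + 5)/6) * ln|R| with df = p(p - 1)/2:
# Bartlett's sphericity statistic by hand
dat <- tbl1[complete.cases(tbl1), ]
n <- nrow(dat); p <- ncol(dat)                     # n = 44, p = 31
R <- cor(dat)
chi2 <- -(n - 1 - (2 * p + 5) / 6) * log(det(R))
df <- p * (p - 1) / 2                              # 31 * 30 / 2 = 465
c(chi2 = chi2, df = df, p = pchisq(chi2, df, lower.tail = FALSE))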
ks.test(y2, "pnorm", mean = mean(y2), sd = sd(y2))
##
## One-sample Kolmogorov-Smirnov test
##
## data: y2
## D = 0.091997, p-value = 0.8662
## alternative hypothesis: two-sided
This means there is very little evidence that the sample comes from a different distribution: with a p-value as high as 0.8662, it is reasonable to assume normality. Thus, there is no evidence to reject the hypothesis that the data are normally distributed. (Because the mean and standard deviation were estimated from the same sample, the standard Kolmogorov-Smirnov p-value is only approximate.)
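Since the parameters were estimated from the sample, a Lilliefors-corrected test is the stricter check; a sketch, assuming the nortest package is installed (it is not loaded in this script):
# Lilliefors (KS) normality test with estimated parameters
# install.packages("nortest")
nortest::lillie.test(y2)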
The Kaiser-Meyer-Olkin (KMO) test checks sampling adequacy on a scale from 0.0 to 1.0. Kaiser (1974) suggests that a value below 0.5 is unacceptable, 0.5 to 0.6 is miserable, 0.6 to 0.7 is mediocre, 0.7 to 0.8 is middling, 0.8 to 0.9 is meritorious, and 0.9 to 1.0 is marvelous. An MSA (the individual measure of sampling adequacy for each item) below 0.5 indicates that the item does not belong to the group and may be removed from the EFA.
For the KMO and MSA tests, we used Nakazawa's methodology for R, as suggested by Bates (2011).
x <- subset(tbl1, complete.cases(tbl1))     # keep complete cases only
r <- cor(x)                                 # correlation matrix
r2 <- r^2                                   # squared correlations
i <- solve(r)                               # inverse of the correlation matrix
d <- diag(i)
p2 <- (-i / sqrt(outer(d, d)))^2            # squared partial (anti-image) correlations
diag(r2) <- diag(p2) <- 0                   # drop the diagonals from both sums
KMO <- sum(r2) / (sum(r2) + sum(p2))        # overall KMO: shared vs. partial correlation
KMO
## [1] 0.505774
MSA <- colSums(r2) / (colSums(r2) + colSums(p2))   # per-item measures of sampling adequacy
MSA
## V1 V2 V3 V4 V5 V8 V9 V10
## 0.4609946 0.4477631 0.6733416 0.5670581 0.5489502 0.2767630 0.2763022 0.4998019
## V11 V12 V13 V14 V15 V16 V17 V18
## 0.6053480 0.5109641 0.4202440 0.3865621 0.8186567 0.3409139 0.7342401 0.3208846
## V19 V20 V21 V22 V23 V26 V27 V29
## 0.4849010 0.6136971 0.5262870 0.7225698 0.3658816 0.5456820 0.6818296 0.6916242
## V30 V31 V32 V33 V35 V36 V37
## 0.4204208 0.6254519 0.5232902 0.5188519 0.4685922 0.4805286 0.3222689
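As a cross-check, the psych package (already loaded) implements the same anti-image computation in KMO(), which should reproduce the overall KMO and the per-item MSA values above. At roughly 0.51, the overall KMO sits at the lower bound of acceptability on Kaiser's scale, and the items with MSA below 0.5 are candidates for removal.
# Cross-check of the manual computation with psych's implementation
kmo_check <- KMO(r)   # r = cor(x) computed above
kmo_check$MSA         # overall KMO, ~0.506
kmo_check$MSAi        # per-item MSA values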
fa <- factanal(tbl, factors = 3, scores = 'regression')
fa
##
## Call:
## factanal(x = tbl, factors = 3, scores = "regression")
##
## Uniquenesses:
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
## 0.371 0.601 0.518 0.416 0.454 0.592 0.605 0.671 0.722 0.505 0.507 0.709 0.665
## V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26
## 0.653 0.325 0.641 0.358 0.575 0.684 0.409 0.412 0.404 0.530 0.417 0.656 0.387
## V27 V28 V29 V30 V31 V32 V33 V34 V35 V36 V37 V38 V39
## 0.464 0.798 0.345 0.768 0.521 0.096 0.023 0.888 0.823 0.610 0.891 0.983 0.905
##
## Loadings:
## Factor1 Factor2 Factor3
## V1 -0.665 -0.300 -0.310
## V2 0.528 0.337
## V3 0.559 0.112 0.397
## V4 -0.655 -0.209 -0.335
## V5 0.673 0.219 0.214
## V6 0.554 0.279 0.154
## V7 0.213 0.345 0.481
## V8 0.566
## V9 0.495 -0.182
## V10 0.655 0.247
## V11 0.275 0.614 0.201
## V12 0.517 -0.147
## V13 0.576
## V14 0.557 0.168
## V15 0.782 0.250
## V16 0.591
## V17 0.265 0.751
## V18 0.636 0.135
## V19 0.454 -0.238 0.230
## V20 0.539 0.316 0.448
## V21 0.424 0.610 0.188
## V22 0.262 0.701 0.191
## V23 0.647 0.179 0.141
## V24 0.750 0.132
## V25 0.573
## V26 0.617 0.426 0.225
## V27 0.312 -0.121 0.651
## V28 -0.159 0.227 0.354
## V29 0.257 0.188 0.744
## V30 0.257 0.407
## V31 0.320 0.276 0.549
## V32 0.151 0.938
## V33 0.983
## V34 0.230 0.232
## V35 0.407
## V36 0.317 0.533
## V37 0.322
## V38
## V39 -0.205 0.113 -0.199
##
## Factor1 Factor2 Factor3
## SS loadings 6.459 5.356 5.284
## Proportion Var 0.166 0.137 0.135
## Cumulative Var 0.166 0.303 0.438
##
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 839.59 on 627 degrees of freedom.
## The p-value is 2.49e-08
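Since the chi-square test rejects the hypothesis that 3 factors are sufficient (p = 2.49e-08), one way to probe the factor count, sketched here under the assumption that factanal() converges for each k, is to scan the likelihood-ratio p-value over candidate models:
# Sketch: likelihood-ratio p-values for a range of factor counts
for (k in 3:8) {
  ft <- factanal(tbl, factors = k)
  cat(k, "factors: p =", format.pval(ft$PVAL), "\n")
}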
corMat <- cor(tbl1)
faPC <- fa(r = corMat, nfactors = 3, n.obs = 44, rotate = 'varimax')   # n.obs = nrow(tbl1) = 44
fa.diagram(faPC)
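To read the rotated loadings numerically rather than from the diagram, a small follow-up (output not shown in the original):
# Numeric view of the varimax-rotated loadings, suppressing values below 0.3
print(faPC$loadings, cutoff = 0.3)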
For Bartlett's test, see the file "TESTES".
To determine the number of factors, a parallel analysis (the 'PARALLEL' test) was performed, which indicates 3 factors.
fa.parallel(tbl)
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Parallel analysis suggests that the number of factors = 3 and the number of components = 3