#library(tidyverse)
library(stringr)
library(readxl)
library(faraway)
library(mctest)
library(REdaS)
## Loading required package: grid
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:faraway':
##
## logit
tbl <- read_excel('original.xlsx')
tbl
## # A tibble: 44 x 39
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3 3 2 3 3 4 3 3 2 3 3 4 2
## 2 1 5 2 1 5 4 2 4 2 4 4 4 3
## 3 3 3 1 2 5 5 2 4 2 5 3 5 1
## 4 3 3 2 3 1 1 5 1 1 5 5 5 1
## 5 2 3 4 2 2 3 3 1 2 4 3 4 2
## 6 1 5 3 1 5 5 3 5 3 5 5 5 3
## 7 1 5 3 1 4 1 3 3 2 5 4 4 1
## 8 3 3 3 3 3 3 3 3 3 3 3 3 3
## 9 1 1 5 1 5 3 3 3 3 3 3 3 3
## 10 2 3 2 1 5 4 3 3 2 5 5 4 4
## # ... with 34 more rows, and 26 more variables: V14 <dbl>, V15 <dbl>,
## # V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>, V21 <dbl>,
## # V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>, V27 <dbl>,
## # V28 <dbl>, V29 <dbl>, V30 <dbl>, V31 <dbl>, V32 <dbl>, V33 <dbl>,
## # V34 <dbl>, V35 <dbl>, V36 <dbl>, V37 <dbl>, V38 <dbl>, V39 <dbl>
Exclude the dichotomous variables.
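No code for this step appears in the script; below is a minimal sketch, assuming a dichotomous item is one with exactly two distinct observed values (is_dichotomous and tbl_no_dich are hypothetical names, and the downstream code keeps using tbl).
# Hypothetical filter: flag items with exactly two distinct observed values
is_dichotomous <- sapply(tbl, function(v) length(unique(na.omit(v))) == 2)
tbl_no_dich <- tbl[, !is_dichotomous]   # sketch only; not used below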
Test for multicollinearity. 3.1 Check multicollinearity with vif (see the paper Multicolinearidade.pdf, p. 49) and mctest (see the paper Collinearity.pdf); all variables with VIF > 5 or 10 should be excluded.
vif(tbl)
## V1 V2 V3 V4 V5 V6 V7
## 90.784280 31.198154 12.300448 29.527466 22.556435 6.019989 6.590458
## V8 V9 V10 V11 V12 V13 V14
## 23.926717 16.669632 20.545358 12.565751 26.671694 14.261724 21.487539
## V15 V16 V17 V18 V19 V20 V21
## 11.734746 23.389864 18.430357 31.185097 10.765581 16.081202 35.230609
## V22 V23 V24 V25 V26 V27 V28
## 11.107738 79.193734 5.709470 8.998989 39.415580 49.532076 9.234147
## V29 V30 V31 V32 V33 V34 V35
## 12.830974 12.427203 32.354667 117.043875 121.444058 8.917779 25.345198
## V36 V37 V38 V39
## 27.465033 11.484875 7.398812 9.165682
tbl1 <- tbl[vif(tbl) > 10]   # keeps the columns whose VIF exceeds 10
tbl1
## # A tibble: 44 x 31
## V1 V2 V3 V4 V5 V8 V9 V10 V11 V12 V13 V14 V15
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3 3 2 3 3 3 2 3 3 4 2 4 4
## 2 1 5 2 1 5 4 2 4 4 4 3 3 4
## 3 3 3 1 2 5 4 2 5 3 5 1 2 5
## 4 3 3 2 3 1 1 1 5 5 5 1 1 5
## 5 2 3 4 2 2 1 2 4 3 4 2 3 5
## 6 1 5 3 1 5 5 3 5 5 5 3 4 5
## 7 1 5 3 1 4 3 2 5 4 4 1 3 5
## 8 3 3 3 3 3 3 3 3 3 3 3 3 3
## 9 1 1 5 1 5 3 3 3 3 3 3 3 3
## 10 2 3 2 1 5 3 2 5 5 4 4 4 4
## # ... with 34 more rows, and 18 more variables: V16 <dbl>, V17 <dbl>,
## # V18 <dbl>, V19 <dbl>, V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>,
## # V26 <dbl>, V27 <dbl>, V29 <dbl>, V30 <dbl>, V31 <dbl>, V32 <dbl>,
## # V33 <dbl>, V35 <dbl>, V36 <dbl>, V37 <dbl>
head(tbl1)
## # A tibble: 6 x 31
## V1 V2 V3 V4 V5 V8 V9 V10 V11 V12 V13 V14 V15
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3 3 2 3 3 3 2 3 3 4 2 4 4
## 2 1 5 2 1 5 4 2 4 4 4 3 3 4
## 3 3 3 1 2 5 4 2 5 3 5 1 2 5
## 4 3 3 2 3 1 1 1 5 5 5 1 1 5
## 5 2 3 4 2 2 1 2 4 3 4 2 3 5
## 6 1 5 3 1 5 5 3 5 5 5 3 4 5
## # ... with 18 more variables: V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>,
## # V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>, V26 <dbl>, V27 <dbl>,
## # V29 <dbl>, V30 <dbl>, V31 <dbl>, V32 <dbl>, V33 <dbl>, V35 <dbl>,
## # V36 <dbl>, V37 <dbl>
x <- tbl[, -1]   # predictors (reassigned below for the Hald example)
y <- tbl[, 1]    # response
Identify the variables with multicollinearity (see the paper Collinearity.pdf):
library(mctest)
head(Hald)
## y X1 X2 X3 X4
## [1,] 78.5 7 26 6 60
## [2,] 74.3 1 29 15 52
## [3,] 104.3 11 56 8 20
## [4,] 87.6 11 31 8 47
## [5,] 95.9 7 52 6 33
## [6,] 109.2 11 55 9 22
x <- Hald[, -1]
y <- Hald[, 1]
model <- lm(y ~ X1 + X2 + X3 + X4, data = as.data.frame(Hald))
# Collinearity Diagnostic measures in matrix of 0 or 1
imcdiag(model, all = TRUE)
##
## Call:
## imcdiag(mod = model, all = TRUE)
##
##
## All Individual Multicollinearity Diagnostics in 0 or 1
##
## VIF TOL Wi Fi Leamer CVIF Klein IND1 IND2
## X1 1 1 1 1 0 0 0 1 1
## X2 1 1 1 1 1 0 1 1 1
## X3 1 1 1 1 0 0 0 1 1
## X4 1 1 1 1 1 0 1 1 1
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
##
## X1 , X2 , X3 , X4 , coefficient(s) are non-significant may be due to multicollinearity
##
## R-square of y on all x: 0.9824
##
## * use method argument to check which regressors may be the reason of collinearity
## ===================================
imcdiag(model, method = "VIF", all = TRUE)
##
## Call:
## imcdiag(mod = model, method = "VIF", all = TRUE)
##
##
## VIF Multicollinearity Diagnostics
##
## VIF detection
## X1 38.4962 1
## X2 254.4232 1
## X3 46.8684 1
## X4 282.5129 1
##
## All Individual Multicollinearity Diagnostics in 0 or 1
##
## VIF TOL Wi Fi Leamer CVIF Klein IND1 IND2
## X1 1 1 1 1 0 0 0 1 1
## X2 1 1 1 1 1 0 1 1 1
## X3 1 1 1 1 0 0 0 1 1
## X4 1 1 1 1 1 0 1 1 1
##
## Multicollinearity may be due to X1 X2 X3 X4 regressors
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
##
## ===================================
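The Hald data above is only mctest's built-in worked example. As a sketch (not part of the original analysis), the same individual diagnostics could be run on the questionnaire items, taking V1 as an arbitrary stand-in response since no dependent variable is designated here.
# Sketch: mctest diagnostics on the survey items, with V1 as an arbitrary response
model_tbl <- lm(V1 ~ ., data = as.data.frame(tbl))
imcdiag(model_tbl, method = "VIF")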
y1 <- c(tbl)      # coerce the tibble to a list of 39 column vectors
y2 <- rnorm(y1)   # draws length(y1) = 39 standard-normal values (simulated, not the survey responses)
shapiro.test(y2)
##
## Shapiro-Wilk normality test
##
## data: y2
## W = 0.98162, p-value = 0.7618
Interpretation: W = 1.0 is the "perfect fit"; the W statistic is essentially the Pearson correlation applied to the curve drawn by qqnorm(). Under the null hypothesis of shapiro.test, a p-value <= 0.05 would reject the hypothesis that the sample comes from a normal distribution; here p = 0.7618, so normality is not rejected.
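A minimal sketch of that connection (not in the original analysis): the squared correlation between the sample values and the theoretical quantiles returned by qqnorm() tracks W closely.
# Squared correlation of the qqnorm() points approximates the Shapiro-Wilk W
qq <- qqnorm(y2, plot.it = FALSE)
cor(qq$x, qq$y)^2   # should be close to W = 0.98162 for this sample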
Ref: https://www.rdocumentation.org/packages/REdaS/versions/0.9.3/topics/Bartlett-Sphericity
The bartlett.test() call below is a test of homogeneity of variances: a p-value above the 0.05 significance level would mean we cannot reject the null hypothesis that the variance is the same across groups. This is not the same as Bartlett's Test of Sphericity (bart_spher(), further below), which tests whether the correlation matrix is an identity matrix. For Factor Analysis to be considered suitable, the p-value of Bartlett's Test of Sphericity must be less than 0.05.
bartlett.test(tbl)
##
## Bartlett test of homogeneity of variances
##
## data: tbl
## Bartlett's K-squared = 103.62, df = 38, p-value = 5.39e-08
datamatrix <- data.frame("A" = rnorm(100), "B" = rnorm(100), "C" = rnorm(100))
head(datamatrix)
## A B C
## 1 0.94134839 1.1163323 0.2865841
## 2 -0.90638169 1.6050925 0.4783473
## 3 -0.46250610 -0.9855091 1.1257085
## 4 -0.03222622 -0.1194222 -1.5910541
## 5 0.65481876 -0.5670538 1.0443353
## 6 0.42254332 1.1800630 0.7532627
# correlation matrix
cor(datamatrix)
## A B C
## A 1.00000000 0.1279772 -0.06272345
## B 0.12797718 1.0000000 0.18364892
## C -0.06272345 0.1836489 1.00000000
# bartlett's test
bart_spher(datamatrix)
## Bartlett's Test of Sphericity
##
## Call: bart_spher(x = datamatrix)
##
## X2 = 5.701
## df = 3
## p-value = .12708
corMat <- cor(tbl1)
bart_spher(corMat)
## Bartlett's Test of Sphericity
##
## Call: bart_spher(x = corMat)
##
## X2 = 2400.262
## df = 465
## p-value < 2.22e-16
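For transparency, here is a minimal sketch of the statistic that bart_spher() reports, computed by hand on the complete cases of tbl1 using the standard formula chi^2 = -(n - 1 - (2p + 5)/6) * ln|R| with df = p(p - 1)/2:
# Bartlett's sphericity statistic by hand
dat <- tbl1[complete.cases(tbl1), ]
n <- nrow(dat); p <- ncol(dat)                     # n = 44, p = 31
R <- cor(dat)
chi2 <- -(n - 1 - (2 * p + 5) / 6) * log(det(R))
df <- p * (p - 1) / 2                              # 31 * 30 / 2 = 465
c(chi2 = chi2, df = df, p = pchisq(chi2, df, lower.tail = FALSE))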
ks.test(y2, "pnorm", mean = mean(y2), sd = sd(y2))
##
## One-sample Kolmogorov-Smirnov test
##
## data: y2
## D = 0.091997, p-value = 0.8662
## alternative hypothesis: two-sided
This means there is very little evidence that the sample comes from a different distribution: with a p-value as high as 0.8662, it is reasonable to assume normality. Thus, there is no evidence to reject the hypothesis that the data are normally distributed. (Because the mean and standard deviation were estimated from the same sample, the standard Kolmogorov-Smirnov p-value is only approximate.)
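Since the parameters were estimated from the sample, a Lilliefors-corrected test is the stricter check; a sketch, assuming the nortest package is installed (it is not loaded in this script):
# Lilliefors (KS) normality test with estimated parameters
# install.packages("nortest")
nortest::lillie.test(y2)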
The Kaiser-Meyer-Olkin (KMO) test checks sampling adequacy on a scale from 0.0 to 1.0. Kaiser (1974) suggests that a value below 0.5 is unacceptable, 0.5 to 0.6 is miserable, 0.6 to 0.7 is mediocre, 0.7 to 0.8 is middling, 0.8 to 0.9 is meritorious, and 0.9 to 1.0 is marvelous. An MSA (the individual measure of sampling adequacy for each item) below 0.5 indicates that the item does not belong to the group and may be removed from the EFA.
For the KMO and MSA tests, we used Nakazawa's methodology for R, as suggested by Bates (2011).
x <- subset(tbl1, complete.cases(tbl1))     # keep complete cases only
r <- cor(x)                                 # correlation matrix
r2 <- r^2                                   # squared correlations
i <- solve(r)                               # inverse of the correlation matrix
d <- diag(i)
p2 <- (-i / sqrt(outer(d, d)))^2            # squared partial (anti-image) correlations
diag(r2) <- diag(p2) <- 0                   # drop the diagonals from both sums
KMO <- sum(r2) / (sum(r2) + sum(p2))        # overall KMO: shared vs. partial correlation
KMO
## [1] 0.505774
MSA <- colSums(r2) / (colSums(r2) + colSums(p2))   # per-item measures of sampling adequacy
MSA
## V1 V2 V3 V4 V5 V8 V9 V10
## 0.4609946 0.4477631 0.6733416 0.5670581 0.5489502 0.2767630 0.2763022 0.4998019
## V11 V12 V13 V14 V15 V16 V17 V18
## 0.6053480 0.5109641 0.4202440 0.3865621 0.8186567 0.3409139 0.7342401 0.3208846
## V19 V20 V21 V22 V23 V26 V27 V29
## 0.4849010 0.6136971 0.5262870 0.7225698 0.3658816 0.5456820 0.6818296 0.6916242
## V30 V31 V32 V33 V35 V36 V37
## 0.4204208 0.6254519 0.5232902 0.5188519 0.4685922 0.4805286 0.3222689
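As a cross-check, the psych package (already loaded) implements the same anti-image computation in KMO(), which should reproduce the overall KMO and the per-item MSA values above. At roughly 0.51, the overall KMO sits at the lower bound of acceptability on Kaiser's scale, and the items with MSA below 0.5 are candidates for removal.
# Cross-check of the manual computation with psych's implementation
kmo_check <- KMO(r)   # r = cor(x) computed above
kmo_check$MSA         # overall KMO, ~0.506
kmo_check$MSAi        # per-item MSA values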
fa <- factanal(tbl, factors = 3, scores = 'regression')
fa
##
## Call:
## factanal(x = tbl, factors = 3, scores = "regression")
##
## Uniquenesses:
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
## 0.371 0.601 0.518 0.416 0.454 0.592 0.605 0.671 0.722 0.505 0.507 0.709 0.665
## V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26
## 0.653 0.325 0.641 0.358 0.575 0.684 0.409 0.412 0.404 0.530 0.417 0.656 0.387
## V27 V28 V29 V30 V31 V32 V33 V34 V35 V36 V37 V38 V39
## 0.464 0.798 0.345 0.768 0.521 0.096 0.023 0.888 0.823 0.610 0.891 0.983 0.905
##
## Loadings:
## Factor1 Factor2 Factor3
## V1 -0.665 -0.300 -0.310
## V2 0.528 0.337
## V3 0.559 0.112 0.397
## V4 -0.655 -0.209 -0.335
## V5 0.673 0.219 0.214
## V6 0.554 0.279 0.154
## V7 0.213 0.345 0.481
## V8 0.566
## V9 0.495 -0.182
## V10 0.655 0.247
## V11 0.275 0.614 0.201
## V12 0.517 -0.147
## V13 0.576
## V14 0.557 0.168
## V15 0.782 0.250
## V16 0.591
## V17 0.265 0.751
## V18 0.636 0.135
## V19 0.454 -0.238 0.230
## V20 0.539 0.316 0.448
## V21 0.424 0.610 0.188
## V22 0.262 0.701 0.191
## V23 0.647 0.179 0.141
## V24 0.750 0.132
## V25 0.573
## V26 0.617 0.426 0.225
## V27 0.312 -0.121 0.651
## V28 -0.159 0.227 0.354
## V29 0.257 0.188 0.744
## V30 0.257 0.407
## V31 0.320 0.276 0.549
## V32 0.151 0.938
## V33 0.983
## V34 0.230 0.232
## V35 0.407
## V36 0.317 0.533
## V37 0.322
## V38
## V39 -0.205 0.113 -0.199
##
## Factor1 Factor2 Factor3
## SS loadings 6.459 5.356 5.284
## Proportion Var 0.166 0.137 0.135
## Cumulative Var 0.166 0.303 0.438
##
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 839.59 on 627 degrees of freedom.
## The p-value is 2.49e-08
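Since the chi-square test rejects the hypothesis that 3 factors are sufficient (p = 2.49e-08), one way to probe the factor count, sketched here under the assumption that factanal() converges for each k, is to scan the likelihood-ratio p-value over candidate models:
# Sketch: likelihood-ratio p-values for a range of factor counts
for (k in 3:8) {
  ft <- factanal(tbl, factors = k)
  cat(k, "factors: p =", format.pval(ft$PVAL), "\n")
}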
corMat <- cor(tbl1)
faPC <- fa(r = corMat, nfactors = 3, n.obs = 44, rotate = 'varimax')   # n.obs = nrow(tbl1) = 44
fa.diagram(faPC)
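To read the rotated loadings numerically rather than from the diagram, a small follow-up (output not shown in the original):
# Numeric view of the varimax-rotated loadings, suppressing values below 0.3
print(faPC$loadings, cutoff = 0.3)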
For Bartlett's test, see the file "TESTES".
To determine the number of factors, a parallel analysis (the 'PARALLEL' test) was performed, which indicates 3 factors.
fa.parallel(tbl)
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Parallel analysis suggests that the number of factors = 3 and the number of components = 3