INTRODUÇÃO

Exemplo tabela 10.5 de Fávero e Belfiore (2017, p.395). Notas de 100 alunos nas disciplinas Finanças, Custos, Marketing e Atuária.

Obs.: Para visualização das referências, ver ementa da disciplina.

ESTATÍSTICAS INICIAIS

# Abrindo e tratando a base de dados
library(haven)
exemp4 <- read_dta("D:/dados/exemp4.dta")
View(exemp4)

attach(exemp4)
library(psych)

X <- cbind(financas, custos, marketing, atuaria )


# Estatísticas Iniciais

summary(X)

##     financas          custos         marketing         atuaria      
##  Min.   : 0.600   Min.   : 1.851   Min.   : 1.000   Min.   : 1.700  
##  1st Qu.: 3.100   1st Qu.: 2.901   1st Qu.: 3.000   1st Qu.: 3.200  
##  Median : 5.800   Median : 4.000   Median : 6.000   Median : 5.000  
##  Mean   : 5.834   Mean   : 4.714   Mean   : 5.667   Mean   : 5.314  
##  3rd Qu.: 9.000   3rd Qu.: 6.000   3rd Qu.: 8.000   3rd Qu.: 7.025  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000

cor(X)

##             financas      custos    marketing     atuaria
## financas   1.0000000 0.755844573 -0.029666801  0.71087119
## custos     0.7558446 1.000000000  0.003064165  0.80907469
## marketing -0.0296668 0.003064165  1.000000000 -0.04429093
## atuaria    0.7108712 0.809074692 -0.044290934  1.00000000

KMO E TESTE DE BARTLETT

Teste de Bartlett

# Passo a Passo
teste_Bartlett<-function(X) {
  n <- nrow(X)
  p <- ncol(X)
  qui_quadrado <- -(n-1-(2*p+5)/6)*log(det(cor(X)))
  gl <- p*(p-1)/2
  valor_p <- pchisq(qui_quadrado, gl, lower.tail=FALSE)
  
return(list(qui_quadrado=qui_quadrado, graus_de_liberdade=gl,valor_p=valor_p))
}
teste_Bartlett(X)

## $qui_quadrado
## [1] 192.3353
## 
## $graus_de_liberdade
## [1] 6
## 
## $valor_p
## [1] 8.109387e-39

# Utilizando o comando direto
cortest.bartlett(X)

## R was not square, finding R from data

## $chisq
## [1] 192.3353
## 
## $p.value
## [1] 8.109387e-39
## 
## $df
## [1] 6

KMO

# Passo a passo
kmo = function(X)
{
 
  r = cor(X)
  r2 = r^2 
  i = solve(r) 
  d = diag(i) 
  p2 = (-i/sqrt(outer(d, d)))^2 
  diag(r2) <- diag(p2) <- 0 
  kmo = sum(r2)/(sum(r2)+sum(p2))
  
  return(list(KMO=kmo))
}

kmo(X)

## $KMO
## [1] 0.7370506

# Utilizando o comando direto
KMO(X)

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = X)
## Overall MSA =  0.74
## MSA for each item = 
##  financas    custos marketing   atuaria 
##      0.81      0.69      0.21      0.73

AUTOVALORES E AUTOVETORES

pca1 <- princomp(X, scores=TRUE, cor=TRUE)
summary(pca1)

## Importance of components:
##                           Comp.1    Comp.2     Comp.3     Comp.4
## Standard deviation     1.5871317 1.0001978 0.54566770 0.42762595
## Proportion of Variance 0.6297468 0.2500989 0.07443831 0.04571599
## Cumulative Proportion  0.6297468 0.8798457 0.95428401 1.00000000

loadings(pca1)

## 
## Loadings:
##           Comp.1 Comp.2 Comp.3 Comp.4
## financas   0.564         0.801  0.201
## custos     0.589        -0.220 -0.776
## marketing         0.999              
## atuaria    0.578        -0.557  0.596
## 
##                Comp.1 Comp.2 Comp.3 Comp.4
## SS loadings      1.00   1.00   1.00   1.00
## Proportion Var   0.25   0.25   0.25   0.25
## Cumulative Var   0.25   0.50   0.75   1.00

plot(pca1)

screeplot(pca1, type="line", main="Scree Plot")

Comando Alternativo

library(FactoMineR)
pca3 = PCA(X, graph = F)
# matriz com autovalores 
pca3$eig

##        eigenvalue percentage of variance cumulative percentage of variance
## comp 1  2.5189871              62.974678                          62.97468
## comp 2  1.0003957              25.009893                          87.98457
## comp 3  0.2977532               7.443831                          95.42840
## comp 4  0.1828639               4.571599                         100.00000

ANÁLISE FATORIAL POR COMPONENTES PRINCIPAIS

Inicial

pc<-principal(X,nfactors=2,score=TRUE, rotate="none")
pc

## Principal Components Analysis
## Call: principal(r = X, nfactors = 2, rotate = "none", scores = TRUE)
## Standardized loadings (pattern matrix) based upon correlation matrix
##             PC1   PC2   h2      u2 com
## financas   0.90  0.01 0.80 0.19833   1
## custos     0.93  0.05 0.88 0.12462   1
## marketing -0.04  1.00 1.00 0.00033   1
## atuaria    0.92 -0.01 0.84 0.15733   1
## 
##                        PC1  PC2
## SS loadings           2.52 1.00
## Proportion Var        0.63 0.25
## Cumulative Var        0.63 0.88
## Proportion Explained  0.72 0.28
## Cumulative Proportion 0.72 1.00
## 
## Mean item complexity =  1
## Test of the hypothesis that 2 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.06 
##  with the empirical chi square  4.25  with prob <  NA 
## 
## Fit based upon off diagonal values = 0.99

head(pc$scores)

##              PC1        PC2
## [1,]  0.01560443 -1.6646876
## [2,] -1.07611938  1.5029204
## [3,] -0.59955112 -0.6034488
## [4,]  1.34559072  0.8868034
## [5,] -0.97842901 -0.9216180
## [6,]  1.97899529 -1.5527807

summary(pc$scores)

##       PC1                PC2         
##  Min.   :-1.47128   Min.   :-1.6647  
##  1st Qu.:-0.88209   1st Qu.:-0.9290  
##  Median :-0.09511   Median : 0.1235  
##  Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.80758   3rd Qu.: 0.8156  
##  Max.   : 1.97900   Max.   : 1.6242

Scores fatoriais a partir de uma regressão

Este procedimento pode ser realizado também para o caso dos fatores rotacionados.

# Criando as variáveis padronizadas
zfinancas=(financas-mean(financas))/sd(financas)
zcustos=(custos-mean(custos))/sd(custos)
zmarketing=(marketing-mean(marketing))/sd(marketing)
zatuaria=(atuaria-mean(atuaria))/sd(atuaria)

# nova base

nvar_pc<-(pc$scores)
base_nova <- data.frame(exemp4$estudante,nvar_pc, zfinancas, zcustos, zmarketing, zatuaria)

# regressão

summary(lm(PC1~zfinancas+ zcustos+ zmarketing+ zatuaria, data=base_nova))

## Warning in summary.lm(lm(PC1 ~ zfinancas + zcustos + zmarketing +
## zatuaria, : essentially perfect fit: summary may be unreliable

## 
## Call:
## lm(formula = PC1 ~ zfinancas + zcustos + zmarketing + zatuaria, 
##     data = base_nova)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.355e-15 -4.640e-17  1.370e-17  8.890e-17  1.622e-15 
## 
## Coefficients:
##               Estimate Std. Error    t value Pr(>|t|)    
## (Intercept) -6.129e-17  4.101e-17 -1.495e+00    0.138    
## zfinancas    3.554e-01  6.518e-17  5.453e+15   <2e-16 ***
## zcustos      3.709e-01  7.818e-17  4.745e+15   <2e-16 ***
## zmarketing  -1.682e-02  4.136e-17 -4.066e+14   <2e-16 ***
## zatuaria     3.644e-01  7.276e-17  5.008e+15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.101e-16 on 95 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 1.472e+32 on 4 and 95 DF,  p-value: < 2.2e-16

summary(lm(PC2~zfinancas+ zcustos+ zmarketing+ zatuaria, data=base_nova))

## Warning in summary.lm(lm(PC2 ~ zfinancas + zcustos + zmarketing +
## zatuaria, : essentially perfect fit: summary may be unreliable

## 
## Call:
## lm(formula = PC2 ~ zfinancas + zcustos + zmarketing + zatuaria, 
##     data = base_nova)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.437e-15 -9.290e-17  4.100e-18  9.390e-17  2.276e-15 
## 
## Coefficients:
##               Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)  1.945e-16  4.533e-17  4.290e+00  4.3e-05 ***
## zfinancas    6.788e-03  7.206e-17  9.420e+13  < 2e-16 ***
## zcustos      4.869e-02  8.642e-17  5.634e+14  < 2e-16 ***
## zmarketing   9.985e-01  4.573e-17  2.184e+16  < 2e-16 ***
## zatuaria    -1.010e-02  8.044e-17 -1.256e+14  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.533e-16 on 95 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 1.204e+32 on 4 and 95 DF,  p-value: < 2.2e-16

Cargas fatoriais a partir de uma regressão

Este procedimento pode ser realizado também para o caso dos fatores rotacionados.

summary(lm(zfinancas ~ PC1+PC2, data=base_nova))

## 
## Call:
## lm(formula = zfinancas ~ PC1 + PC2, data = base_nova)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.23047 -0.33766 -0.01578  0.30466  0.89756 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.062e-17  4.499e-02    0.00    1.000    
## PC1          8.953e-01  4.522e-02   19.80   <2e-16 ***
## PC2          6.790e-03  4.522e-02    0.15    0.881    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4499 on 97 degrees of freedom
## Multiple R-squared:  0.8017, Adjusted R-squared:  0.7976 
## F-statistic:   196 on 2 and 97 DF,  p-value: < 2.2e-16

summary(lm(zcustos ~ PC1+PC2, data=base_nova))

## 
## Call:
## lm(formula = zcustos ~ PC1 + PC2, data = base_nova)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.17007 -0.21309  0.01527  0.19362  1.59222 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.384e-16  3.566e-02   0.000    1.000    
## PC1         9.343e-01  3.584e-02  26.068   <2e-16 ***
## PC2         4.871e-02  3.584e-02   1.359    0.177    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3566 on 97 degrees of freedom
## Multiple R-squared:  0.8754, Adjusted R-squared:  0.8728 
## F-statistic: 340.7 on 2 and 97 DF,  p-value: < 2.2e-16

summary(lm(zmarketing ~ PC1+PC2, data=base_nova))

## 
## Call:
## lm(formula = zmarketing ~ PC1 + PC2, data = base_nova)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.077957 -0.010697 -0.001659  0.009006  0.060179 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.008e-16  1.837e-03    0.00        1    
## PC1         -4.237e-02  1.846e-03  -22.95   <2e-16 ***
## PC2          9.989e-01  1.846e-03  541.12   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01837 on 97 degrees of freedom
## Multiple R-squared:  0.9997, Adjusted R-squared:  0.9997 
## F-statistic: 1.467e+05 on 2 and 97 DF,  p-value: < 2.2e-16

summary(lm(zatuaria ~ PC1+PC2, data=base_nova))

## 
## Call:
## lm(formula = zatuaria ~ PC1 + PC2, data = base_nova)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.02882 -0.23245 -0.01945  0.20069  1.10599 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.791e-17  4.007e-02   0.000    1.000    
## PC1          9.179e-01  4.027e-02  22.792   <2e-16 ***
## PC2         -1.010e-02  4.027e-02  -0.251    0.802    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4007 on 97 degrees of freedom
## Multiple R-squared:  0.8427, Adjusted R-squared:  0.8394 
## F-statistic: 259.8 on 2 and 97 DF,  p-value: < 2.2e-16

RANKING DE DESEMPENHO

Este procedimento pode ser realizado também para o caso dos fatores rotacionados.

base_nova$ranking<-0.6297468*base_nova$PC1+0.25009893*base_nova$PC2

head(base_nova)

##   exemp4.estudante         PC1        PC2   zfinancas    zcustos
## 1         Gabriela  0.01560443 -1.6646876 -0.01088779 -0.2904900
## 2      Luiz Felipe -1.07611938  1.5029204 -0.87551076 -0.6972874
## 3         Patrícia -0.59955112 -0.6034488 -0.87551076 -0.2904900
## 4          Gustavo  1.34559072  0.8868034  1.33408107  1.3366998
## 5          Letícia -0.97842901 -0.9216180 -0.77944149 -1.1040849
## 6           Ovídio  1.97899529 -1.5527807  1.33408107  2.1502946
##   zmarketing   zatuaria    ranking
## 1 -1.6501184  0.2729721 -0.4065098
## 2  1.5317333 -1.3187018 -0.3018040
## 3 -0.5895012 -0.5228648 -0.5284873
## 4  0.8246552  1.0688090  1.0691700
## 5 -0.8723324 -0.8411996 -0.8466582
## 6 -1.6501184  1.8646459  0.8579172

Fatores Rotacionados

pc_rot<-principal(X,nfactors=2,score=TRUE, rotate="varimax")
pc_rot

## Principal Components Analysis
## Call: principal(r = X, nfactors = 2, rotate = "varimax", scores = TRUE)
## Standardized loadings (pattern matrix) based upon correlation matrix
##            RC1   RC2   h2      u2 com
## financas  0.89 -0.04 0.80 0.19833   1
## custos    0.94  0.00 0.88 0.12462   1
## marketing 0.01  1.00 1.00 0.00033   1
## atuaria   0.92 -0.06 0.84 0.15733   1
## 
##                        RC1  RC2
## SS loadings           2.52 1.00
## Proportion Var        0.63 0.25
## Cumulative Var        0.63 0.88
## Proportion Explained  0.71 0.29
## Cumulative Proportion 0.71 1.00
## 
## Mean item complexity =  1
## Test of the hypothesis that 2 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.06 
##  with the empirical chi square  4.25  with prob <  NA 
## 
## Fit based upon off diagonal values = 0.99

head(pc_rot$scores)

##              RC1        RC2
## [1,] -0.06783908 -1.6633780
## [2,] -0.99945013  1.5549605
## [3,] -0.62903893 -0.5726448
## [4,]  1.38834113  0.8182564
## [5,] -1.02338545 -0.8714272
## [6,]  1.89869290 -1.6500047

# Biplot of score variables
biplot(pca1)

# Scores of the components
pca1$scores[1:10,]

##            Comp.1     Comp.2      Comp.3      Comp.4
##  [1,]  0.02489105 -1.6734050 -0.09676231  0.31741272
##  [2,] -1.71654751  1.5107907  0.18738577 -0.35733903
##  [3,] -0.95636042 -0.6066089 -0.34742618 -0.28872991
##  [4,]  2.14638862  0.8914472  0.17932857 -0.09777575
##  [5,] -1.56071892 -0.9264442  0.08818270  0.16272173
##  [6,]  3.15674961 -1.5609120 -0.44534545 -0.36165412
##  [7,] -0.17735479  0.8329039 -0.17100253 -0.18434043
##  [8,]  0.38622192  0.1399065 -0.38043288 -0.26771881
##  [9,] -0.44799674 -0.6004909  0.37420473 -0.10742138
## [10,] -0.41168509 -0.6000540  0.42574974 -0.09447078

APÊNDICE: ALPHA DE CRONBACH

alpha(X)

## Warning in alpha(X): Some items were negatively correlated with the total scale and probably 
## should be reversed.  
## To do this, run the function again with the 'check.keys=TRUE' option

## Some items ( marketing ) were negatively correlated with the total scale and 
## probably should be reversed.  
## To do this, run the function again with the 'check.keys=TRUE' option

## 
## Reliability analysis   
## Call: alpha(x = X)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N   ase mean sd median_r
##       0.68       0.7    0.76      0.37 2.3 0.053  5.4  2     0.36
## 
##  lower alpha upper     95% confidence boundaries
## 0.58 0.68 0.79 
## 
##  Reliability if an item is dropped:
##           raw_alpha std.alpha G6(smc) average_r  S/N alpha se  var.r
## financas       0.47      0.51    0.63      0.26 1.03    0.098 0.2300
## custos         0.44      0.45    0.54      0.21 0.81    0.099 0.1865
## marketing      0.90      0.90    0.87      0.76 9.43    0.018 0.0024
## atuaria        0.48      0.49    0.58      0.24 0.96    0.093 0.1975
##             med.r
## financas   0.0031
## custos    -0.0297
## marketing  0.7558
## atuaria    0.0031
## 
##  Item statistics 
##             n raw.r std.r  r.cor r.drop mean  sd
## financas  100  0.85  0.84  0.804  0.653  5.8 3.1
## custos    100  0.87  0.89  0.904  0.754  4.7 2.5
## marketing 100  0.33  0.32 -0.025 -0.027  5.7 2.8
## atuaria   100  0.84  0.85  0.851  0.691  5.3 2.5

MÉTODOS QUANTITATIVOS

6)ANÁLISE FATORIAL POR COMPONENTES PRINCIPAIS

Anderson Moreira Aristides dos Santos

INTRODUÇÃO

ESTATÍSTICAS INICIAIS

KMO E TESTE DE BARTLETT

Teste de Bartlett

KMO

AUTOVALORES E AUTOVETORES

Comando Alternativo

ANÁLISE FATORIAL POR COMPONENTES PRINCIPAIS

Inicial

Scores fatoriais a partir de uma regressão

Cargas fatoriais a partir de uma regressão

RANKING DE DESEMPENHO

Fatores Rotacionados

APÊNDICE: ALPHA DE CRONBACH