Exemplo tabela 10.5 de Fávero e Belfiore (2017, p.395). Notas de 100 alunos nas disciplinas Finanças, Custos, Marketing e Atuária.
Obs.: Para visualização das referências, ver ementa da disciplina.
# Abrindo e tratando a base de dados
library(haven)
exemp4 <- read_dta("D:/dados/exemp4.dta")
View(exemp4)
attach(exemp4)
library(psych)
X <- cbind(financas, custos, marketing, atuaria )
# Estatísticas Iniciais
summary(X)
## financas custos marketing atuaria
## Min. : 0.600 Min. : 1.851 Min. : 1.000 Min. : 1.700
## 1st Qu.: 3.100 1st Qu.: 2.901 1st Qu.: 3.000 1st Qu.: 3.200
## Median : 5.800 Median : 4.000 Median : 6.000 Median : 5.000
## Mean : 5.834 Mean : 4.714 Mean : 5.667 Mean : 5.314
## 3rd Qu.: 9.000 3rd Qu.: 6.000 3rd Qu.: 8.000 3rd Qu.: 7.025
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
cor(X)
## financas custos marketing atuaria
## financas 1.0000000 0.755844573 -0.029666801 0.71087119
## custos 0.7558446 1.000000000 0.003064165 0.80907469
## marketing -0.0296668 0.003064165 1.000000000 -0.04429093
## atuaria 0.7108712 0.809074692 -0.044290934 1.00000000
# Passo a Passo
teste_Bartlett<-function(X) {
n <- nrow(X)
p <- ncol(X)
qui_quadrado <- -(n-1-(2*p+5)/6)*log(det(cor(X)))
gl <- p*(p-1)/2
valor_p <- pchisq(qui_quadrado, gl, lower.tail=FALSE)
return(list(qui_quadrado=qui_quadrado, graus_de_liberdade=gl,valor_p=valor_p))
}
teste_Bartlett(X)
## $qui_quadrado
## [1] 192.3353
##
## $graus_de_liberdade
## [1] 6
##
## $valor_p
## [1] 8.109387e-39
# Utilizando o comando direto
cortest.bartlett(X)
## R was not square, finding R from data
## $chisq
## [1] 192.3353
##
## $p.value
## [1] 8.109387e-39
##
## $df
## [1] 6
# Passo a passo
kmo = function(X)
{
r = cor(X)
r2 = r^2
i = solve(r)
d = diag(i)
p2 = (-i/sqrt(outer(d, d)))^2
diag(r2) <- diag(p2) <- 0
kmo = sum(r2)/(sum(r2)+sum(p2))
return(list(KMO=kmo))
}
kmo(X)
## $KMO
## [1] 0.7370506
# Utilizando o comando direto
KMO(X)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = X)
## Overall MSA = 0.74
## MSA for each item =
## financas custos marketing atuaria
## 0.81 0.69 0.21 0.73
pca1 <- princomp(X, scores=TRUE, cor=TRUE)
summary(pca1)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 1.5871317 1.0001978 0.54566770 0.42762595
## Proportion of Variance 0.6297468 0.2500989 0.07443831 0.04571599
## Cumulative Proportion 0.6297468 0.8798457 0.95428401 1.00000000
loadings(pca1)
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4
## financas 0.564 0.801 0.201
## custos 0.589 -0.220 -0.776
## marketing 0.999
## atuaria 0.578 -0.557 0.596
##
## Comp.1 Comp.2 Comp.3 Comp.4
## SS loadings 1.00 1.00 1.00 1.00
## Proportion Var 0.25 0.25 0.25 0.25
## Cumulative Var 0.25 0.50 0.75 1.00
plot(pca1)
screeplot(pca1, type="line", main="Scree Plot")
library(FactoMineR)
pca3 = PCA(X, graph = F)
# matriz com autovalores
pca3$eig
## eigenvalue percentage of variance cumulative percentage of variance
## comp 1 2.5189871 62.974678 62.97468
## comp 2 1.0003957 25.009893 87.98457
## comp 3 0.2977532 7.443831 95.42840
## comp 4 0.1828639 4.571599 100.00000
pc<-principal(X,nfactors=2,score=TRUE, rotate="none")
pc
## Principal Components Analysis
## Call: principal(r = X, nfactors = 2, rotate = "none", scores = TRUE)
## Standardized loadings (pattern matrix) based upon correlation matrix
## PC1 PC2 h2 u2 com
## financas 0.90 0.01 0.80 0.19833 1
## custos 0.93 0.05 0.88 0.12462 1
## marketing -0.04 1.00 1.00 0.00033 1
## atuaria 0.92 -0.01 0.84 0.15733 1
##
## PC1 PC2
## SS loadings 2.52 1.00
## Proportion Var 0.63 0.25
## Cumulative Var 0.63 0.88
## Proportion Explained 0.72 0.28
## Cumulative Proportion 0.72 1.00
##
## Mean item complexity = 1
## Test of the hypothesis that 2 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.06
## with the empirical chi square 4.25 with prob < NA
##
## Fit based upon off diagonal values = 0.99
head(pc$scores)
## PC1 PC2
## [1,] 0.01560443 -1.6646876
## [2,] -1.07611938 1.5029204
## [3,] -0.59955112 -0.6034488
## [4,] 1.34559072 0.8868034
## [5,] -0.97842901 -0.9216180
## [6,] 1.97899529 -1.5527807
summary(pc$scores)
## PC1 PC2
## Min. :-1.47128 Min. :-1.6647
## 1st Qu.:-0.88209 1st Qu.:-0.9290
## Median :-0.09511 Median : 0.1235
## Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.80758 3rd Qu.: 0.8156
## Max. : 1.97900 Max. : 1.6242
Este procedimento pode ser realizado também para o caso dos fatores rotacionados.
# Criando as variáveis padronizadas
zfinancas=(financas-mean(financas))/sd(financas)
zcustos=(custos-mean(custos))/sd(custos)
zmarketing=(marketing-mean(marketing))/sd(marketing)
zatuaria=(atuaria-mean(atuaria))/sd(atuaria)
# nova base
nvar_pc<-(pc$scores)
base_nova <- data.frame(exemp4$estudante,nvar_pc, zfinancas, zcustos, zmarketing, zatuaria)
# regressão
summary(lm(PC1~zfinancas+ zcustos+ zmarketing+ zatuaria, data=base_nova))
## Warning in summary.lm(lm(PC1 ~ zfinancas + zcustos + zmarketing +
## zatuaria, : essentially perfect fit: summary may be unreliable
##
## Call:
## lm(formula = PC1 ~ zfinancas + zcustos + zmarketing + zatuaria,
## data = base_nova)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.355e-15 -4.640e-17 1.370e-17 8.890e-17 1.622e-15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.129e-17 4.101e-17 -1.495e+00 0.138
## zfinancas 3.554e-01 6.518e-17 5.453e+15 <2e-16 ***
## zcustos 3.709e-01 7.818e-17 4.745e+15 <2e-16 ***
## zmarketing -1.682e-02 4.136e-17 -4.066e+14 <2e-16 ***
## zatuaria 3.644e-01 7.276e-17 5.008e+15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.101e-16 on 95 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 1.472e+32 on 4 and 95 DF, p-value: < 2.2e-16
summary(lm(PC2~zfinancas+ zcustos+ zmarketing+ zatuaria, data=base_nova))
## Warning in summary.lm(lm(PC2 ~ zfinancas + zcustos + zmarketing +
## zatuaria, : essentially perfect fit: summary may be unreliable
##
## Call:
## lm(formula = PC2 ~ zfinancas + zcustos + zmarketing + zatuaria,
## data = base_nova)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.437e-15 -9.290e-17 4.100e-18 9.390e-17 2.276e-15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.945e-16 4.533e-17 4.290e+00 4.3e-05 ***
## zfinancas 6.788e-03 7.206e-17 9.420e+13 < 2e-16 ***
## zcustos 4.869e-02 8.642e-17 5.634e+14 < 2e-16 ***
## zmarketing 9.985e-01 4.573e-17 2.184e+16 < 2e-16 ***
## zatuaria -1.010e-02 8.044e-17 -1.256e+14 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.533e-16 on 95 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 1.204e+32 on 4 and 95 DF, p-value: < 2.2e-16
Este procedimento pode ser realizado também para o caso dos fatores rotacionados.
summary(lm(zfinancas ~ PC1+PC2, data=base_nova))
##
## Call:
## lm(formula = zfinancas ~ PC1 + PC2, data = base_nova)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.23047 -0.33766 -0.01578 0.30466 0.89756
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.062e-17 4.499e-02 0.00 1.000
## PC1 8.953e-01 4.522e-02 19.80 <2e-16 ***
## PC2 6.790e-03 4.522e-02 0.15 0.881
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4499 on 97 degrees of freedom
## Multiple R-squared: 0.8017, Adjusted R-squared: 0.7976
## F-statistic: 196 on 2 and 97 DF, p-value: < 2.2e-16
summary(lm(zcustos ~ PC1+PC2, data=base_nova))
##
## Call:
## lm(formula = zcustos ~ PC1 + PC2, data = base_nova)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.17007 -0.21309 0.01527 0.19362 1.59222
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.384e-16 3.566e-02 0.000 1.000
## PC1 9.343e-01 3.584e-02 26.068 <2e-16 ***
## PC2 4.871e-02 3.584e-02 1.359 0.177
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3566 on 97 degrees of freedom
## Multiple R-squared: 0.8754, Adjusted R-squared: 0.8728
## F-statistic: 340.7 on 2 and 97 DF, p-value: < 2.2e-16
summary(lm(zmarketing ~ PC1+PC2, data=base_nova))
##
## Call:
## lm(formula = zmarketing ~ PC1 + PC2, data = base_nova)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.077957 -0.010697 -0.001659 0.009006 0.060179
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.008e-16 1.837e-03 0.00 1
## PC1 -4.237e-02 1.846e-03 -22.95 <2e-16 ***
## PC2 9.989e-01 1.846e-03 541.12 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01837 on 97 degrees of freedom
## Multiple R-squared: 0.9997, Adjusted R-squared: 0.9997
## F-statistic: 1.467e+05 on 2 and 97 DF, p-value: < 2.2e-16
summary(lm(zatuaria ~ PC1+PC2, data=base_nova))
##
## Call:
## lm(formula = zatuaria ~ PC1 + PC2, data = base_nova)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.02882 -0.23245 -0.01945 0.20069 1.10599
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.791e-17 4.007e-02 0.000 1.000
## PC1 9.179e-01 4.027e-02 22.792 <2e-16 ***
## PC2 -1.010e-02 4.027e-02 -0.251 0.802
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4007 on 97 degrees of freedom
## Multiple R-squared: 0.8427, Adjusted R-squared: 0.8394
## F-statistic: 259.8 on 2 and 97 DF, p-value: < 2.2e-16
Este procedimento pode ser realizado também para o caso dos fatores rotacionados.
base_nova$ranking<-0.6297468*base_nova$PC1+0.25009893*base_nova$PC2
head(base_nova)
## exemp4.estudante PC1 PC2 zfinancas zcustos
## 1 Gabriela 0.01560443 -1.6646876 -0.01088779 -0.2904900
## 2 Luiz Felipe -1.07611938 1.5029204 -0.87551076 -0.6972874
## 3 Patrícia -0.59955112 -0.6034488 -0.87551076 -0.2904900
## 4 Gustavo 1.34559072 0.8868034 1.33408107 1.3366998
## 5 Letícia -0.97842901 -0.9216180 -0.77944149 -1.1040849
## 6 Ovídio 1.97899529 -1.5527807 1.33408107 2.1502946
## zmarketing zatuaria ranking
## 1 -1.6501184 0.2729721 -0.4065098
## 2 1.5317333 -1.3187018 -0.3018040
## 3 -0.5895012 -0.5228648 -0.5284873
## 4 0.8246552 1.0688090 1.0691700
## 5 -0.8723324 -0.8411996 -0.8466582
## 6 -1.6501184 1.8646459 0.8579172
pc_rot<-principal(X,nfactors=2,score=TRUE, rotate="varimax")
pc_rot
## Principal Components Analysis
## Call: principal(r = X, nfactors = 2, rotate = "varimax", scores = TRUE)
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC2 h2 u2 com
## financas 0.89 -0.04 0.80 0.19833 1
## custos 0.94 0.00 0.88 0.12462 1
## marketing 0.01 1.00 1.00 0.00033 1
## atuaria 0.92 -0.06 0.84 0.15733 1
##
## RC1 RC2
## SS loadings 2.52 1.00
## Proportion Var 0.63 0.25
## Cumulative Var 0.63 0.88
## Proportion Explained 0.71 0.29
## Cumulative Proportion 0.71 1.00
##
## Mean item complexity = 1
## Test of the hypothesis that 2 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.06
## with the empirical chi square 4.25 with prob < NA
##
## Fit based upon off diagonal values = 0.99
head(pc_rot$scores)
## RC1 RC2
## [1,] -0.06783908 -1.6633780
## [2,] -0.99945013 1.5549605
## [3,] -0.62903893 -0.5726448
## [4,] 1.38834113 0.8182564
## [5,] -1.02338545 -0.8714272
## [6,] 1.89869290 -1.6500047
# Biplot of score variables
biplot(pca1)
# Scores of the components
pca1$scores[1:10,]
## Comp.1 Comp.2 Comp.3 Comp.4
## [1,] 0.02489105 -1.6734050 -0.09676231 0.31741272
## [2,] -1.71654751 1.5107907 0.18738577 -0.35733903
## [3,] -0.95636042 -0.6066089 -0.34742618 -0.28872991
## [4,] 2.14638862 0.8914472 0.17932857 -0.09777575
## [5,] -1.56071892 -0.9264442 0.08818270 0.16272173
## [6,] 3.15674961 -1.5609120 -0.44534545 -0.36165412
## [7,] -0.17735479 0.8329039 -0.17100253 -0.18434043
## [8,] 0.38622192 0.1399065 -0.38043288 -0.26771881
## [9,] -0.44799674 -0.6004909 0.37420473 -0.10742138
## [10,] -0.41168509 -0.6000540 0.42574974 -0.09447078
alpha(X)
## Warning in alpha(X): Some items were negatively correlated with the total scale and probably
## should be reversed.
## To do this, run the function again with the 'check.keys=TRUE' option
## Some items ( marketing ) were negatively correlated with the total scale and
## probably should be reversed.
## To do this, run the function again with the 'check.keys=TRUE' option
##
## Reliability analysis
## Call: alpha(x = X)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.68 0.7 0.76 0.37 2.3 0.053 5.4 2 0.36
##
## lower alpha upper 95% confidence boundaries
## 0.58 0.68 0.79
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r
## financas 0.47 0.51 0.63 0.26 1.03 0.098 0.2300
## custos 0.44 0.45 0.54 0.21 0.81 0.099 0.1865
## marketing 0.90 0.90 0.87 0.76 9.43 0.018 0.0024
## atuaria 0.48 0.49 0.58 0.24 0.96 0.093 0.1975
## med.r
## financas 0.0031
## custos -0.0297
## marketing 0.7558
## atuaria 0.0031
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## financas 100 0.85 0.84 0.804 0.653 5.8 3.1
## custos 100 0.87 0.89 0.904 0.754 4.7 2.5
## marketing 100 0.33 0.32 -0.025 -0.027 5.7 2.8
## atuaria 100 0.84 0.85 0.851 0.691 5.3 2.5