Um analista de mercado quer estudar as relações estruturais entre quatro indicadores financeiros provenientes de 45 empresas.
Rótulos: Código da empresa (Cód_Emp)
Indicadores: Prazo médio de recebimento de vendas (PMRV, em dias); Endividamento (em %); Vendas (em R$ x mil); Margem líquida de vendas (em %).
# digite o local correto do arquivo
Indic_finan_Fatorial <- read_excel("Indic_finan_Fatorial.xls")
head(Indic_finan_Fatorial)
dados <- Indic_finan_Fatorial[,2:5]
head(dados)
# Estatísticas Descritivas
describe(dados)
dados
4 Variables 45 Observations
---------------------------------------------------------------------------------------------------
PMRV
n missing unique Info Mean .05 .10 .25 .50 .75 .90 .95
45 0 38 1 53.13 12.20 15.41 25.68 49.22 82.39 96.30 97.16
lowest : 6.42 7.49 11.77 13.91 14.98, highest: 95.23 96.30 97.37 99.51 99.80
---------------------------------------------------------------------------------------------------
Endividamento
n missing unique Info Mean .05 .10 .25 .50 .75 .90 .95
45 0 44 1 31.71 16.46 17.68 22.00 29.75 39.00 48.49 52.49
lowest : 14.77 15.09 16.26 17.23 17.55, highest: 48.58 50.61 52.97 53.50 69.44
---------------------------------------------------------------------------------------------------
Vendas
n missing unique Info Mean .05 .10 .25 .50 .75 .90 .95
45 0 45 1 3989 2080 2236 2921 3719 4770 6167 6765
lowest : 1981 2003 2074 2106 2129, highest: 6390 6637 6797 6953 9641
---------------------------------------------------------------------------------------------------
Margem_liquida
n missing unique Info Mean .05 .10 .25 .50 .75 .90 .95
45 0 37 1 13.22 8.736 9.138 10.165 13.054 16.906 17.848 18.062
lowest : 8.453 8.560 8.700 8.881 9.095, highest: 17.240 17.655 17.976 18.083 18.190
---------------------------------------------------------------------------------------------------
boxplot(dados)
dados.pad <- scale(dados)
boxplot(dados.pad)
O box-plot permite a identificação de possíveis outliers (univariados). Pode-se pensar na exclusão desses dados. Cuidado!
# outliers
# Billor, N., Hadi, A. S., and Velleman , P. F. (2000). BACON: Blocked Adaptive Computationally-Efficient Outlier Nominators; Computational Statistics and Data Analysis 34, 279–298.
outliers <- mvBACON(dados)
MV-BACON (subset no. 1): 16 of 45 (35.56 %)
MV-BACON (subset no. 2): 38 of 45 (84.44 %)
MV-BACON (subset no. 3): 40 of 45 (88.89 %)
MV-BACON (subset no. 4): 41 of 45 (91.11 %)
MV-BACON (subset no. 5): 41 of 45 (91.11 %)
# outliers
# table(Indic_finan_Fatorial$Cod_Emp, outliers$subset)
plot(Indic_finan_Fatorial$Cod_Emp, outliers$subset)
# gráfico com o pacote ggplot2
dados2 <- data.frame(Indic_finan_Fatorial$Cod_Emp, outliers$subset)
ggplot(data = dados2, aes(x = Indic_finan_Fatorial.Cod_Emp, y = outliers.subset )) +
geom_point()+
geom_text(aes(label=Indic_finan_Fatorial.Cod_Emp))
# Verificação de normalidade
# http://www.biosoft.hacettepe.edu.tr/MVN/
# testes univariados
uniPlot(dados, type = "qqplot") # creates univariate Q-Q plots
uniPlot(dados, type = "histogram") # creates univariate histograms
# univariate normality tests:
# SW: Shapiro-Wilk,
# CVM: Cramer-von Mises,
# Lillie: Lilliefors (Kolmogorov-Smirnov),
# SF: Shapiro-Francia,
# AD: Anderson-Darling
uniNorm(dados, type = "SW", desc=F)
$`Descriptive Statistics`
NULL
$`Shapiro-Wilk's Normality Test`
uniNorm(dados, type = "CVM" , desc = F)
$`Descriptive Statistics`
NULL
$`Cramer-von Mises's Normality Test`
uniNorm(dados, type = "Lillie", desc = F)
$`Descriptive Statistics`
NULL
$`Lilliefors (Kolmogorov-Smirnov)'s Normality Test`
uniNorm(dados, type = "SF", desc = F)
$`Descriptive Statistics`
NULL
$`Shapiro-Francia's Normality Test`
uniNorm(dados, type = "AD", desc = F)
$`Descriptive Statistics`
NULL
$`Anderson-Darling's Normality Test`
# testes multivariados
hzTest(dados)
Henze-Zirkler's Multivariate Normality Test
---------------------------------------------
data : dados
HZ : 1.2577
p-value : 0.0001032643
Result : Data are not multivariate normal.
---------------------------------------------
roystonTest(dados)
Royston's Multivariate Normality Test
---------------------------------------------
data : dados
H : 35.05432
p-value : 4.837587e-07
Result : Data are not multivariate normal.
---------------------------------------------
mardiaTest(dados)
Mardia's Multivariate Normality Test
---------------------------------------
data : dados
g1p : 4.203704
chi.skew : 31.52778
p.value.skew : 0.04859737
g2p : 23.68478
z.kurtosis : -0.1526065
p.value.kurt : 0.8787086
chi.small.skew : 34.53042
p.value.small : 0.02275265
Result : Data are not multivariate normal.
---------------------------------------
# correlação
mcor <- rcorr(as.matrix(dados))
mcor
PMRV Endividamento Vendas Margem_liquida
PMRV 1.00 0.23 0.63 0.60
Endividamento 0.23 1.00 0.24 -0.10
Vendas 0.63 0.24 1.00 0.58
Margem_liquida 0.60 -0.10 0.58 1.00
n= 45
P
PMRV Endividamento Vendas Margem_liquida
PMRV 0.1210 0.0000 0.0000
Endividamento 0.1210 0.1148 0.5227
Vendas 0.0000 0.1148 0.0000
Margem_liquida 0.0000 0.5227 0.0000
R <- cor(dados)
corrplot(R, method="number",type="upper", order = "hclust", tl.srt = 45)
corrplot(R, method="circle",type="full", order = "hclust", tl.srt = 45)
# ?corrplot
Observa-se que há altas correlações entre as variáveis Vendas, PMRV e Margem_líquida (p-valor < 5%) Existe considerável número de correlações superiores a 0,30 a normalidade multivariada não pode ser evidenciada, contudo podemos usar métodos robustos para estrair os fatores, como por exemplo o método dos componentes principais.
################################################################
#Partial correlation matrix
################################################################
partial.cor <- function (x)
{
R <- cor(x)
RI <- solve(R)
D <- 1/sqrt(diag(RI))
Rp <- -RI * (D %o% D)
diag(Rp) <- 0
rownames(Rp) <- colnames(Rp) <- colnames(x)
Rp
}
mat_anti_imagem <- -partial.cor(dados)
mat_anti_imagem
PMRV Endividamento Vendas Margem_liquida
PMRV 0.0000000 -0.2520021 -0.3380240 -0.4265296
Endividamento -0.2520021 0.0000000 -0.2462634 0.3685448
Vendas -0.3380240 -0.2462634 0.0000000 -0.3869238
Margem_liquida -0.4265296 0.3685448 -0.3869238 0.0000000
################################################################
# The Bartlett's test statistic indicates to what extent we deviate from the reference situation |R| = 1.
################################################################
Bartlett.sphericity.test <- function(x)
{
method <- "Bartlett's test of sphericity"
data.name <- deparse(substitute(x))
x <- subset(x, complete.cases(x)) # Omit missing values
n <- nrow(x)
p <- ncol(x)
chisq <- (1-n+(2*p+5)/6)*log(det(cor(x)))
df <- p*(p-1)/2
p.value <- pchisq(chisq, df, lower.tail=FALSE)
names(chisq) <- "X-squared"
names(df) <- "df"
return(structure(list(statistic=chisq, parameter=df, p.value=p.value,
method=method, data.name=data.name), class="htest"))
}
Bartlett.sphericity.test(dados)
Bartlett's test of sphericity
data: dados
X-squared = 53.165, df = 6, p-value = 1.087e-09
################################################################
# KMO index
################################################################
kmo <- function(x)
{
x <- subset(x, complete.cases(x)) # Omit missing values
r <- cor(x) # Correlation matrix
r2 <- r^2 # Squared correlation coefficients
i <- solve(r) # Inverse matrix of correlation matrix
d <- diag(i) # Diagonal elements of inverse matrix
p2 <- (-i/sqrt(outer(d, d)))^2 # Squared partial correlation coefficients
diag(r2) <- diag(p2) <- 0 # Delete diagonal elements
KMO <- sum(r2)/(sum(r2)+sum(p2))
MSA <- colSums(r2)/(colSums(r2)+colSums(p2))
return(list(KMO=KMO, MSA=MSA))
}
kmo(dados)
$KMO
[1] 0.6309025
$MSA
PMRV Endividamento Vendas Margem_liquida
0.6909296 0.3183487 0.7071779 0.6008364
o KMO torna razoável a aplicação da AF o teste de esfericidade de Bartlett rejeita a hipótese de a matriz de correlações ser identidade
O MSA é adequado, com exceção da variável individamento (0,318). Se a comunalidade for alta, esta variável pode ser sozinha um Fator.
n.dados <- length(dados)
fit <- principal(dados, nfactors=n.dados, rotate="none")
fit
Principal Components Analysis
Call: principal(r = dados, nfactors = n.dados, rotate = "none")
Standardized loadings (pattern matrix) based upon correlation matrix
PC1 PC2 PC3 PC4 h2 u2 com
PMRV 0.88 0.04 0.40 -0.27 1 2.2e-16 1.6
Endividamento 0.27 0.94 0.04 0.21 1 1.1e-16 1.3
Vendas 0.87 0.06 -0.46 -0.17 1 1.0e-15 1.6
Margem_liquida 0.81 -0.43 0.06 0.41 1 0.0e+00 2.1
PC1 PC2 PC3 PC4
SS loadings 2.24 1.07 0.38 0.31
Proportion Var 0.56 0.27 0.09 0.08
Cumulative Var 0.56 0.83 0.92 1.00
Proportion Explained 0.56 0.27 0.09 0.08
Cumulative Proportion 0.56 0.83 0.92 1.00
Mean item complexity = 1.6
Test of the hypothesis that 4 components are sufficient.
The root mean square of the residuals (RMSR) is 0
with the empirical chi square 0 with prob < NA
Fit based upon off diagonal values = 1
fit$values
[1] 2.2424292 1.0708642 0.3757126 0.3109940
#fit$scores
fit$weights
PC1 PC2 PC3 PC4
PMRV 0.3905627 0.04124629 1.05293484 -0.8777025
Endividamento 0.1197428 0.87756939 0.09898185 0.6696605
Vendas 0.3872237 0.05679122 -1.23298856 -0.5353488
Margem_liquida 0.3593400 -0.39846036 0.15125762 1.3077046
fit$loadings
Loadings:
PC1 PC2 PC3 PC4
PMRV 0.876 0.396 -0.273
Endividamento 0.269 0.940 0.208
Vendas 0.868 -0.463 -0.166
Margem_liquida 0.806 -0.427 0.407
PC1 PC2 PC3 PC4
SS loadings 2.242 1.071 0.376 0.311
Proportion Var 0.561 0.268 0.094 0.078
Cumulative Var 0.561 0.828 0.922 1.000
fit$communality
PMRV Endividamento Vendas Margem_liquida
1 1 1 1
# Escolha a quantidade de fatores
fit1 <- principal(dados, nfactors=2, rotate="none")
fit1
Principal Components Analysis
Call: principal(r = dados, nfactors = 2, rotate = "none")
Standardized loadings (pattern matrix) based upon correlation matrix
PC1 PC2 h2 u2 com
PMRV 0.88 0.04 0.77 0.231 1.0
Endividamento 0.27 0.94 0.96 0.045 1.2
Vendas 0.87 0.06 0.76 0.242 1.0
Margem_liquida 0.81 -0.43 0.83 0.169 1.5
PC1 PC2
SS loadings 2.24 1.07
Proportion Var 0.56 0.27
Cumulative Var 0.56 0.83
Proportion Explained 0.68 0.32
Cumulative Proportion 0.68 1.00
Mean item complexity = 1.2
Test of the hypothesis that 2 components are sufficient.
The root mean square of the residuals (RMSR) is 0.09
with the empirical chi square 4.29 with prob < NA
Fit based upon off diagonal values = 0.96
#fit1$values
#fit1$scores
#fit1$weights
fit1$loadings
Loadings:
PC1 PC2
PMRV 0.876
Endividamento 0.269 0.940
Vendas 0.868
Margem_liquida 0.806 -0.427
PC1 PC2
SS loadings 2.242 1.071
Proportion Var 0.561 0.268
Cumulative Var 0.561 0.828
fit1$communality
PMRV Endividamento Vendas Margem_liquida
0.7689927 0.9552446 0.7576810 0.8313751
Observando as comunalidades, há forte relação com os fatores extraídos. Comente! A variável endividamento possui alta comunalidade. Sendo possível considerá-la como um Fator. A extração de dois Fatores explicam cerca de 82,8% da variabilidade total dos dados Há evidência de que no Fator 1 seja predominante as variáveis: PMRV, vendas e Margem_líquida e no Fator 2 variável endividamento. Não é trivial, porém é possível sugerir um nome para o Fator 1 (volume de negócios-faturamento) e para o Fator 2 (estrutura de capital)
biplot(fit1)
O biplot representa representa a relação entre as variáveis e os fatores (após a rotação Varimax nesse caso). Endividamento possui elevada carga no Fator 2 e as demais variáveis no Fator 1
# varimax rotation
fit2 <- principal(dados, nfactors=2, rotate="varimax")
fit2
Principal Components Analysis
Call: principal(r = dados, nfactors = 2, rotate = "varimax")
Standardized loadings (pattern matrix) based upon correlation matrix
RC1 RC2 h2 u2 com
PMRV 0.85 0.21 0.77 0.231 1.1
Endividamento 0.08 0.97 0.96 0.045 1.0
Vendas 0.84 0.23 0.76 0.242 1.1
Margem_liquida 0.87 -0.26 0.83 0.169 1.2
RC1 RC2
SS loadings 2.20 1.12
Proportion Var 0.55 0.28
Cumulative Var 0.55 0.83
Proportion Explained 0.66 0.34
Cumulative Proportion 0.66 1.00
Mean item complexity = 1.1
Test of the hypothesis that 2 components are sufficient.
The root mean square of the residuals (RMSR) is 0.09
with the empirical chi square 4.29 with prob < NA
Fit based upon off diagonal values = 0.96
#fit2$values
fit2$scores
RC1 RC2
[1,] 0.21399479 -0.65575882
[2,] 1.55410548 0.18021964
[3,] 0.90652014 0.32533142
[4,] 1.27155963 -0.92876502
[5,] 1.45483704 -1.41169129
[6,] 1.60845440 1.22298871
[7,] 1.31555933 0.04129477
[8,] 1.76312063 -0.39205557
[9,] 2.49839731 0.30008576
[10,] 0.51881067 -1.45715479
[11,] 0.81519748 -1.24500621
[12,] 0.46002693 2.88534978
[13,] -0.30263148 1.46152848
[14,] -0.27371587 -0.69354358
[15,] -0.07300781 0.23224939
[16,] -0.69430520 -0.66951638
[17,] -0.17537610 0.27031820
[18,] -0.94052968 -0.21279441
[19,] -0.76839192 0.11621117
[20,] -1.15962650 -1.31196319
[21,] -0.88577481 -0.63487636
[22,] -1.10850722 -0.95735728
[23,] -1.25273882 0.18741700
[24,] -1.32608825 -0.60682797
[25,] -1.39461142 1.30963062
[26,] -1.57925069 -0.63108176
[27,] -0.64085654 1.58960822
[28,] -0.16416555 -1.02997919
[29,] -0.82328567 -0.91571359
[30,] -0.02856916 0.90416662
[31,] -1.46292835 0.96211786
[32,] -0.94329724 1.83186078
[33,] -0.58827902 0.81001784
[34,] 0.28426546 0.42583701
[35,] 0.27062217 0.08429988
[36,] 0.52442472 -1.24506751
[37,] -0.34776942 -1.35261727
[38,] 0.11098684 -0.17599485
[39,] -0.54662546 0.13336865
[40,] 1.12134352 0.64673045
[41,] -0.40943545 -0.85208333
[42,] 0.46099025 -0.94715611
[43,] -0.65654291 0.42121750
[44,] 0.20005872 0.85079334
[45,] 1.19303503 1.13436142
fit2$weights
RC1 RC2
PMRV 0.37496743 0.1167900
Endividamento -0.05409636 0.8840475
Vendas 0.36865439 0.1313825
Margem_liquida 0.43029192 -0.3205382
fit2$loadings
Loadings:
RC1 RC2
PMRV 0.850 0.215
Endividamento 0.974
Vendas 0.840 0.229
Margem_liquida 0.874 -0.261
RC1 RC2
SS loadings 2.198 1.116
Proportion Var 0.549 0.279
Cumulative Var 0.549 0.828
fit2$communality
PMRV Endividamento Vendas Margem_liquida
0.7689927 0.9552446 0.7576810 0.8313751
biplot(fit1)
biplot(fit2)
#########
# critério da soma ponderada e ordenamento
# formação de rankings das empresas / indicador de desempenho
# cálculo com a rotação varimax
fit2$values
[1] 2.2424292 1.0708642 0.3757126 0.3109940
peso1 <- fit2$values[1]/sum(fit2$values)
peso2 <- fit2$values[2]/sum(fit2$values)
classificacao <- fit2$scores[,1]*peso1+fit2$scores[,2]*peso2
desempenho_emp <- cbind.data.frame(Indic_finan_Fatorial,classificacao)
desempenho_emp[order(desempenho_emp$classificacao,decreasing=c(TRUE)),]