R Markdown

Componentes Principais:Exemplo aplicação

# install.packages("Matrix")
# Exemplo 1: MMDS.ORG

S <- matrix(c(1,-2,0,-2,5,0,0,0,2),3,3)

eigS <- eigen(S)
Svec <- round(eigS$vectors,digits=3)
Sval <- round(eigS$values,digits=3)

eigS
## eigen() decomposition
## $values
## [1] 5.8284271 2.0000000 0.1715729
## 
## $vectors
##            [,1] [,2]      [,3]
## [1,] -0.3826834    0 0.9238795
## [2,]  0.9238795    0 0.3826834
## [3,]  0.0000000    1 0.0000000
Svec
##        [,1] [,2]  [,3]
## [1,] -0.383    0 0.924
## [2,]  0.924    0 0.383
## [3,]  0.000    1 0.000
Sval
## [1] 5.828 2.000 0.172
# Verificando que o traço da matriz de variância-covariância é igual à soma dos autovalores.
sum(diag(S)) # Traço da matriz sigma
## [1] 8
sum(Sval)  # Soma dos autovalores
## [1] 8
round(cumsum(Sval)/sum(Sval)*100, digits=2)
## [1]  72.85  97.85 100.00
# Biplot com dados de Greenacre
# Exemplo explicado pelo paper da Fundación BBVA

xm <- matrix(c(2,2,1,2,-1,1,1,-1,2,-2),5,2,byrow=T)
ym <- matrix(c(3,2,-1,-2,1,-1,2,-1),2,4,byrow=T)

colnames(ym) <- c("y1","y2","y3","y4")
rownames(xm) <- c("x1","x2","x3","x4","x5")

biplot(xm,t(ym), col=c("darkgreen","red"),
       cex=1.2, expand = 2,
       xlim = c(-3,4), ylim = c(-3,3))
abline(v=0,h=0,lty=3, col = "black" )

dm <- matrix(c(8,2,2,-6,
               5,0,3,-4,
               -2,-3,3,1,
               2,3,-3,-1,
               4,6,-6,-2),5,4,byrow=T)

biplot(prcomp(dm))
abline(v=0,h=0)

# Exemplo 8.10 de J&W: Taxas de retorno semanais

setwd("C:/Users/ricardo.ramos/OneDrive - Ministerio do Desenvolvimento da Industria e Comercio Exterior/UNB/Multivariada/R-Multivariada")

stock <- read.delim("T8-4.txt",header=F)
stock[1:5,]
##           V1         V2         V3         V4         V5
## 1  0.0130338 -0.0078431 -0.0031889 -0.0447693  0.0052151
## 2  0.0084862  0.0166886 -0.0062100  0.0119560  0.0134890
## 3 -0.0179153 -0.0086393  0.0100360  0.0000000 -0.0061428
## 4  0.0215589 -0.0034858  0.0174353 -0.0285917 -0.0069534
## 5  0.0108225  0.0037167 -0.0101345  0.0291900  0.0409751
#library(data.table)
nomes <- c("JPMorgan", "Citibank", "WellsFargo", 
           "Shell", "Exxon")
setnames(stock,nomes)
stock[1:5,]
##     JPMorgan   Citibank WellsFargo      Shell      Exxon
## 1  0.0130338 -0.0078431 -0.0031889 -0.0447693  0.0052151
## 2  0.0084862  0.0166886 -0.0062100  0.0119560  0.0134890
## 3 -0.0179153 -0.0086393  0.0100360  0.0000000 -0.0061428
## 4  0.0215589 -0.0034858  0.0174353 -0.0285917 -0.0069534
## 5  0.0108225  0.0037167 -0.0101345  0.0291900  0.0409751
cpstock <- prcomp(~ JPMorgan + Citibank + WellsFargo
                  + Shell + Exxon, 
                data=stock, scale = F)

plot(cpstock)

screeplot(cpstock,type=c("lines")) 

summary(cpstock)
## Importance of components:
##                            PC1     PC2     PC3     PC4     PC5
## Standard deviation     0.03698 0.02648 0.01593 0.01194 0.01090
## Proportion of Variance 0.52926 0.27133 0.09822 0.05518 0.04601
## Cumulative Proportion  0.52926 0.80059 0.89881 0.95399 1.00000
cpstock
## Standard deviations (1, .., p=5):
## [1] 0.03698213 0.02647942 0.01593118 0.01194163 0.01090352
## 
## Rotation (n x k) = (5 x 5):
##                   PC1        PC2         PC3        PC4         PC5
## JPMorgan   -0.2228228  0.6252260 -0.32611218  0.6627590 -0.11765952
## Citibank   -0.3072900  0.5703900  0.24959014 -0.4140935  0.58860803
## WellsFargo -0.1548103  0.3445049  0.03763929 -0.4970499 -0.78030428
## Shell      -0.6389680 -0.2479475  0.64249741  0.3088689 -0.14845546
## Exxon      -0.6509044 -0.3218478 -0.64586064 -0.2163758  0.09371777
cpstock$rotation
##                   PC1        PC2         PC3        PC4         PC5
## JPMorgan   -0.2228228  0.6252260 -0.32611218  0.6627590 -0.11765952
## Citibank   -0.3072900  0.5703900  0.24959014 -0.4140935  0.58860803
## WellsFargo -0.1548103  0.3445049  0.03763929 -0.4970499 -0.78030428
## Shell      -0.6389680 -0.2479475  0.64249741  0.3088689 -0.14845546
## Exxon      -0.6509044 -0.3218478 -0.64586064 -0.2163758  0.09371777
biplot(cpstock,col=c("black","red"))

Including Plots

##     V1    V2    V3   V4   V5
## 1 2.67  5.71 69.02 30.3 1.48
## 2 2.25  4.37 72.98 43.3 1.44
## 3 3.12 10.27 64.94 32.0 2.11
## 4 5.14  7.44 71.29 24.5 1.85
## 5 5.54  9.25 74.94 31.0 2.23
##   Population Degree Employed Government Median
## 1       2.67   5.71    69.02       30.3   1.48
## 2       2.25   4.37    72.98       43.3   1.44
## 3       3.12  10.27    64.94       32.0   2.11
## 4       5.14   7.44    71.29       24.5   1.85
## 5       5.54   9.25    74.94       31.0   2.23

## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5
## Standard deviation     10.345 6.2986 2.89324 1.69348 0.39331
## Proportion of Variance  0.677 0.2510 0.05295 0.01814 0.00098
## Cumulative Proportion   0.677 0.9279 0.98088 0.99902 1.00000
## Standard deviations (1, .., p=5):
## [1] 10.3448177  6.2985820  2.8932449  1.6934798  0.3933104
## 
## Rotation (n x k) = (5 x 5):
##                     PC1         PC2         PC3         PC4          PC5
## Population  0.038887287 -0.07114494  0.18789258  0.97713524 -0.057699864
## Degree     -0.105321969 -0.12975236 -0.96099580  0.17135181 -0.138554092
## Employed    0.492363944 -0.86438807  0.04579737 -0.09104368  0.004966048
## Government -0.863069865 -0.48033178  0.15318538 -0.02968577  0.006691800
## Median     -0.009122262 -0.01474342 -0.12498114  0.08170118  0.988637470
##                     PC1         PC2         PC3         PC4          PC5
## Population  0.038887287 -0.07114494  0.18789258  0.97713524 -0.057699864
## Degree     -0.105321969 -0.12975236 -0.96099580  0.17135181 -0.138554092
## Employed    0.492363944 -0.86438807  0.04579737 -0.09104368  0.004966048
## Government -0.863069865 -0.48033178  0.15318538 -0.02968577  0.006691800
## Median     -0.009122262 -0.01474342 -0.12498114  0.08170118  0.988637470

## Exemplo 8.18 de J&W: Recordes nacionais para provas de corrida - mulheres.

setwd("C:/Users/ricardo.ramos/OneDrive - Ministerio do Desenvolvimento da Industria e Comercio Exterior/UNB/Multivariada/R-Multivariada")

library(Matrix)
recor <- read.delim("T1-9.dat",header=F)
recor[1:5,]
##    V1    V2    V3    V4   V5   V6   V7     V8
## 1 ARG 11.57 22.94 52.50 2.05 4.25 9.19 150.32
## 2 AUS 11.12 22.23 48.63 1.98 4.02 8.63 143.51
## 3 AUT 11.15 22.70 50.62 1.94 4.05 8.78 154.35
## 4 BEL 11.14 22.48 51.45 1.97 4.08 8.82 143.05
## 5 BER 11.46 23.05 53.30 2.07 4.29 9.81 174.18
nomes <- c("Pais","v100m","v200m","v400m","v800m",
           "v1500m","v3000m","Maratona")
setnames(recor,nomes)
recor[1:5,]
##   Pais v100m v200m v400m v800m v1500m v3000m Maratona
## 1  ARG 11.57 22.94 52.50  2.05   4.25   9.19   150.32
## 2  AUS 11.12 22.23 48.63  1.98   4.02   8.63   143.51
## 3  AUT 11.15 22.70 50.62  1.94   4.05   8.78   154.35
## 4  BEL 11.14 22.48 51.45  1.97   4.08   8.82   143.05
## 5  BER 11.46 23.05 53.30  2.07   4.29   9.81   174.18
#var(recor[,2:8])
#cor(recor[,2:8])

cprecor <- prcomp(~ v100m + v200m + v400m + v800m + v1500m
                  + v3000m + Maratona, 
                  data=recor, scale = T)

attributes(cprecor)
## $names
## [1] "sdev"     "rotation" "center"   "scale"    "x"        "call"    
## 
## $class
## [1] "prcomp"
plot(cprecor) # ou screeplot(cprecor,type=c("lines"))

plot(cumsum(cprecor$sdev**2)/sum(cprecor$sdev**2), 
     type="b", ylim = c(0.8,1),
     xlab="CPs", ylab="Proporção da Variância")

summary(cprecor)
## Importance of components:
##                           PC1     PC2    PC3     PC4    PC5     PC6
## Standard deviation     2.4099 0.79290 0.5285 0.35292 0.3016 0.23349
## Proportion of Variance 0.8297 0.08981 0.0399 0.01779 0.0130 0.00779
## Cumulative Proportion  0.8297 0.91947 0.9594 0.97717 0.9902 0.99796
##                            PC7
## Standard deviation     0.11959
## Proportion of Variance 0.00204
## Cumulative Proportion  1.00000
cprecor
## Standard deviations (1, .., p=7):
## [1] 2.4099013 0.7929019 0.5285211 0.3529231 0.3016152 0.2334927 0.1195921
## 
## Rotation (n x k) = (7 x 7):
##                 PC1        PC2        PC3         PC4         PC5
## v100m    -0.3777657  0.4071756  0.1405803 -0.58706293  0.16706891
## v200m    -0.3832103  0.4136291  0.1007833 -0.19407501 -0.09350016
## v400m    -0.3680361  0.4593531 -0.2370255  0.64543118 -0.32727328
## v800m    -0.3947810 -0.1612459 -0.1475424  0.29520804  0.81905467
## v1500m   -0.3892610 -0.3090877  0.4219855  0.06669044 -0.02613100
## v3000m   -0.3760945 -0.4231899  0.4060627  0.08015699 -0.35169796
## Maratona -0.3552031 -0.3892153 -0.7410610 -0.32107640 -0.24700821
##                  PC6         PC7
## v100m    -0.53969730 -0.08893934
## v200m     0.74493139  0.26565662
## v400m    -0.24009405 -0.12660435
## v800m     0.01650651  0.19521315
## v1500m    0.18898771 -0.73076817
## v3000m   -0.24049968  0.57150644
## Maratona  0.04826992 -0.08208401
biplot(cprecor,col=c("black","red"),
       xlabs=recor$Pais,
       cex=c(1,1.2))

cprecor$x[,1]
##            1            2            3            4            5 
## -0.393240234  1.931642887  1.262520373  1.291730279 -1.396108552 
##            6            7            8            9           10 
##  1.006778878  1.734340591 -0.811838204  2.989466907 -0.001927672 
##           11           12           13           14           15 
## -7.906227224 -2.166811506  2.406030321  0.082495533 -2.192409809 
##           16           17           18           19           20 
##  1.266731340  2.518345696  3.047516603  2.442706280  1.197800425 
##           21           22           23           24           25 
## -3.294123799  0.788251063 -1.741942057  0.354256642  1.035907216 
##           26           27           28           29           30 
## -0.574161730  1.547452839  0.481657610  0.917735409 -0.830794629 
##           31           32           33           34           35 
## -1.455347346 -1.721467731 -1.495210140 -1.749727754  0.995766285 
##           36           37           38           39           40 
## -0.815981458  1.544760622  0.755235487  0.553003461 -5.257449747 
##           41           42           43           44           45 
## -1.763533682  2.273765780  1.175249957  2.123005711  3.042948214 
##           46           47           48           49           50 
## -8.213415123 -3.093919517  1.889462264  0.839149567  1.113545239 
##           51           52           53           54 
## -0.659093139 -1.223805050  0.850127798  3.299148823
# Verificando a ordem dos Países em relação à 1ª componente principal
recor$Pais[order(cprecor$x[,1],decreasing = T)[recor$Pais]]
##  [1] USA    GER    RUS    CHN    FRA    GBR    CZE    POL    ROM    ESP   
## [11] AUS    CAN    ITA    NED    BEL    AUT    GRE    SUI    POR    IRL   
## [21] BRA    MEX    KEN    TUR    SWE    HUN    NZL    NOR    JPN    DEN   
## [31] IND    COL    ARG    TPE    ISR    CHI    MYA    THA    KOR, S KOR, N
## [41] BER    MAS    LUX    INA    MRI    PHI    CRC    FIN    SIN    DOM   
## [51] PNG    GUA    COK    SAM   
## 54 Levels: ARG AUS AUT BEL BER BRA CAN CHI CHN COK COL CRC CZE DEN ... USA
# Verificando a ordem dos Países em relação à 2ª componente principal
recor$Pais[order(cprecor$x[,2],decreasing = T)[recor$Pais]]
##  [1] KOR, N KEN    LUX    NOR    SIN    CHI    IRL    KOR, S DEN    JPN   
## [11] POR    MYA    HUN    TUR    GUA    MRI    COK    NZL    INA    ROM   
## [21] NED    ITA    ARG    GBR    BEL    SWE    CHN    AUT    ISR    IND   
## [31] ESP    CAN    CRC    FIN    BRA    DOM    RUS    AUS    MEX    MAS   
## [41] POL    PHI    CZE    BER    GRE    THA    GER    SUI    TPE    COL   
## [51] USA    FRA    PNG    SAM   
## 54 Levels: ARG AUS AUT BEL BER BRA CAN CHI CHN COK COL CRC CZE DEN ... USA
# Analisando a relação entre a 1ª CP e a 3ª CP
biplot(cprecor, choices=c(1,3))

Estudo dirigido: Referência-Mingoti

Verificar divergência valor dos escores

setwd("C:/Users/ricardo.ramos/OneDrive - Ministerio do Desenvolvimento da Industria e Comercio Exterior/UNB/Multivariada/R-Multivariada")

empresas <- read.delim("MingotiTB3.1.tsv",header=T)


empresas <- tbl_df(empresas)

filter_empresas<-empresas %>% select(gbt,glq,patr)

# Matriz de covariância
cov(filter_empresas)
##             gbt       glq       patr
## gbt   9550608.6 706121.06 14978232.5
## glq    706121.1  76269.52   933915.1
## patr 14978232.5 933915.06 34408113.0
cpempresas <- prcomp(~ gbt + glq + patr, 
                  data=empresas, scale = F)

attributes(cpempresas)
## $names
## [1] "sdev"     "rotation" "center"   "scale"    "x"        "call"    
## 
## $class
## [1] "prcomp"
plot(cpempresas) # ou

screeplot(cpempresas,type=c("lines"))

plot(cumsum(cpempresas$sdev**2)/sum(cpempresas$sdev**2), 
     type="b", ylim = c(0.8,1),
     xlab="CPs", ylab="Proporção da Variância")

summary(cpempresas)
## Importance of components:
##                              PC1       PC2       PC3
## Standard deviation     6440.0615 1.594e+03 145.23266
## Proportion of Variance    0.9418 5.767e-02   0.00048
## Cumulative Proportion     0.9418 9.995e-01   1.00000
cpempresas
## Standard deviations (1, .., p=3):
## [1] 6440.0615 1593.5831  145.2327
## 
## Rotation (n x k) = (3 x 3):
##              PC1         PC2         PC3
## gbt  -0.42509725 -0.89970680 -0.09909593
## glq  -0.02766083 -0.09651661  0.99494694
## patr -0.90472493  0.42569029  0.01614231
biplot(cpempresas,col=c("black","red"),
       xlabs=empresas$emp,
       cex=c(1,1.2))

cpempresas$x[,1]
##           1           2           3           4           5           6 
##  -8857.5943  -8079.3608 -11257.9265    690.7986  -3844.0909   5915.4162 
##           7           8           9          10          11          12 
##   5504.9703   3796.3802   7729.1502   3848.1751   3989.1624    564.9194
# Verificando a ordem dos Países em relação à 1ª componente principal
empresas$emp[order(cpempresas$x[,1],decreasing = T)[empresas$emp]]
##  [1] E9  E10 E8  E4  E12 E5  E2  E1  E3  E6  E7  E11
## Levels: E1 E10 E11 E12 E2 E3 E4 E5 E6 E7 E8 E9

Exemplo 3.2 Mingoti

setwd("C:/Users/ricardo.ramos/OneDrive - Ministerio do Desenvolvimento da Industria e Comercio Exterior/UNB/Multivariada/R-Multivariada")

coxinhas <- read.delim("MingotiTB3.5.tsv",header=T)


coxinhas <- tbl_df(coxinhas)

filter_coxinhas<-coxinhas %>% select(sabor, aroma,massa, recheio)

# Matriz de covariância
cov(filter_coxinhas)
##             sabor     aroma     massa   recheio
## sabor   0.4069071 0.1822107 0.3573571 0.5504750
## aroma   0.1822107 0.1104411 0.1796357 0.2713804
## massa   0.3573571 0.1796357 0.4242000 0.5897357
## recheio 0.5504750 0.2713804 0.5897357 0.9105696
cpcoxinhas <- prcomp(~ sabor + aroma + massa +recheio,
                  data=coxinhas, scale = F)

attributes(cpcoxinhas)
## $names
## [1] "sdev"     "rotation" "center"   "scale"    "x"        "call"    
## 
## $class
## [1] "prcomp"
plot(cpcoxinhas) # ou

screeplot(cpcoxinhas,type=c("lines"))

plot(cumsum(cpcoxinhas$sdev**2)/sum(cpcoxinhas$sdev**2), 
     type="b", ylim = c(0.8,1),
     xlab="CPs", ylab="Proporção da Variância")

summary(cpcoxinhas)
## Importance of components:
##                           PC1     PC2     PC3     PC4
## Standard deviation     1.3179 0.25473 0.16707 0.14992
## Proportion of Variance 0.9378 0.03503 0.01507 0.01214
## Cumulative Proportion  0.9378 0.97279 0.98786 1.00000
cpcoxinhas
## Standard deviations (1, .., p=4):
## [1] 1.3178933 0.2547255 0.1670736 0.1499212
## 
## Rotation (n x k) = (4 x 4):
##               PC1        PC2        PC3         PC4
## sabor   0.4557692  0.8160940 -0.1121458 -0.33717690
## aroma   0.2234675  0.2149422 -0.2691445  0.91182420
## massa   0.4770120 -0.4564207 -0.7177864 -0.22118397
## recheio 0.7174930 -0.2819051  0.6322715  0.07724004
biplot(cpcoxinhas,col=c("black","red"),
       xlabs=coxinhas$marca,
       cex=c(1,1.2))

cpcoxinhas$x[,1]
##           1           2           3           4           5           6 
## -1.72821832 -0.25202070 -0.87331991  1.60440053  1.29635636 -1.42969189 
##           7           8 
##  1.42970986 -0.04721593
# Verificando a ordem dos Países em relação à 1ª componente principal
coxinhas$marca[order(cpcoxinhas$x[,1],decreasing = T)[coxinhas$marca]]
## [1] M4 M7 M5 M8 M2 M3 M6 M1
## Levels: M1 M2 M3 M4 M5 M6 M7 M8

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.