library(MASS)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::select() masks MASS::select()
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# 1. Generate Variables & EDA -------------------------------------------------------
# In this section, we generate a dummy dataset for our simulation.
# We assume that the IFI consists of only two dimensions, namely dimension A and dimension B.
# Dimension A (the first sub-index of the IFI) consists of three input indicators,
# while dimension B (the second sub-index of the IFI) consists of four input indicators.
set.seed(21102021)
# create the variance-covariance matrices
sigma_A<-rbind(c(1,0.8,0.7), c(0.8,1, 0.9), c(0.7,0.9,1))
sigma_B<-rbind(c(1,0.7,0.75,0.8), c(0.7,1,0.75,0.8), c(0.75,0.75,1,0.9),c(0.8,0.8,0.9,1))
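# Optional sanity check (a sketch): both covariance matrices should be
# positive definite (all eigenvalues > 0), otherwise mvrnorm() will reject Sigma.
eigen(sigma_A, only.values = TRUE)$values
eigen(sigma_B, only.values = TRUE)$values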
# create the mean vectors
mu_A<-c(10, 5, 2)
mu_B <- c(8.1,7.5,11,5)
# draw samples from the multivariate normal distributions
data_A <- as.data.frame(mvrnorm(n = 34, # 34 observations
                                mu = mu_A, Sigma = sigma_A)) %>%
  arrange(V1) # sort the data in ascending order of V1
data_A
## V1 V2 V3
## 1 8.060314 3.766230 0.62110750
## 2 8.349427 3.899478 0.40125260
## 3 8.801945 3.759563 1.49625976
## 4 8.841254 2.959679 -0.06285384
## 5 8.903298 4.762103 2.08006776
## 6 8.973170 4.346222 1.09117249
## 7 8.987169 4.184988 1.16776393
## 8 9.111682 4.706975 2.34477793
## 9 9.726779 5.192716 2.38876858
## 10 9.880644 5.683585 2.85084347
## 11 9.903996 3.845549 0.28860395
## 12 9.904142 4.521493 1.48171513
## 13 9.979606 4.861729 1.41177744
## 14 9.993027 5.023735 1.17678679
## 15 10.025249 5.278150 2.88734685
## 16 10.252577 4.248625 0.67272458
## 17 10.308330 4.390691 1.15688805
## 18 10.322231 4.784315 2.06228722
## 19 10.403886 5.682867 2.06884251
## 20 10.442857 6.076200 2.70715781
## 21 10.453826 4.527375 1.75529200
## 22 10.607456 5.975030 3.42520135
## 23 10.659481 4.253073 1.23570053
## 24 10.769230 4.985568 2.09222797
## 25 10.822491 5.655021 3.10350340
## 26 10.951511 5.131998 1.18064362
## 27 10.991703 4.742193 1.03097336
## 28 10.997492 5.772918 2.68863765
## 29 11.321746 5.066543 2.26418902
## 30 11.352267 5.644275 2.14751708
## 31 11.653416 5.647513 2.24260451
## 32 11.670272 6.196614 2.63570637
## 33 11.941981 5.615213 2.01506714
## 34 12.165017 7.145965 3.57822112
data_B <- as.data.frame(mvrnorm(n = 34, mu = mu_B, Sigma = sigma_B)) %>%
  arrange(V4)
data_B
## V1 V2 V3 V4
## 1 6.746662 5.475464 8.667404 1.985876
## 2 6.398852 6.307941 9.030900 2.817570
## 3 6.149399 7.687241 9.364629 2.900476
## 4 6.012812 7.949263 9.014755 3.220847
## 5 8.169693 7.418500 10.404226 3.776148
## 6 8.108144 6.387218 9.180112 3.893488
## 7 6.936631 6.020874 10.452149 4.077512
## 8 8.830227 7.718388 10.309080 4.163707
## 9 8.567042 6.960023 9.858251 4.221691
## 10 7.119075 7.068528 9.433241 4.234232
## 11 7.773713 6.705047 10.295575 4.235153
## 12 7.320785 5.986648 10.377317 4.746006
## 13 7.982613 8.288166 10.061391 4.784432
## 14 7.751169 7.896337 10.708707 4.865225
## 15 7.946412 8.051586 11.234394 4.876658
## 16 7.908821 8.748182 10.515884 4.886957
## 17 8.702276 6.675432 10.924299 4.887201
## 18 8.736747 7.916392 11.379632 5.030690
## 19 7.844875 7.984728 11.788480 5.073620
## 20 9.836384 7.214103 11.609466 5.124974
## 21 7.946139 8.103361 11.335065 5.176824
## 22 8.615952 7.847164 11.653299 5.410091
## 23 9.058074 7.810235 11.727991 5.412414
## 24 8.741835 8.212784 11.034568 5.436669
## 25 8.061654 8.094959 11.329944 5.527612
## 26 7.737040 7.543972 11.314776 5.667719
## 27 8.938332 7.984169 11.789868 5.872118
## 28 8.117245 6.678297 11.689216 6.070793
## 29 8.830796 8.678087 11.643901 6.236033
## 30 8.967668 8.641723 11.519852 6.337825
## 31 8.734360 9.332130 12.129725 6.364449
## 32 9.641468 8.668167 12.269787 6.495610
## 33 9.497850 7.894292 12.513680 6.547198
## 34 8.369085 8.241846 11.692097 6.629042
ggpairs(data_A)

ggpairs(data_B)

# Standard normal (z-score) transformation
data_A_std <- data_A %>%
  mutate(V1 = (V1 - mean(V1)) / sd(V1),
         V2 = (V2 - mean(V2)) / sd(V2),
         V3 = (V3 - mean(V3)) / sd(V3))
data_B_std <- data_B %>%
  mutate(V1 = (V1 - mean(V1)) / sd(V1),
         V2 = (V2 - mean(V2)) / sd(V2),
         V3 = (V3 - mean(V3)) / sd(V3),
         V4 = (V4 - mean(V4)) / sd(V4))
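# Equivalent sketch using base R: scale() performs the same z-score
# transformation in a single call.
all.equal(as.matrix(data_A_std), scale(data_A), check.attributes = FALSE)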
# Standardization doesn't change the pattern of the data: compare the two graphs below
data_A %>% ggplot(aes(x=1:34,y=V1))+geom_line()

data_A_std %>% ggplot(aes(x=1:34,y=V1))+geom_line()

cov(data_A)
## V1 V2 V3
## V1 1.0642278 0.6577953 0.5009833
## V2 0.6577953 0.7410407 0.6827315
## V3 0.5009833 0.6827315 0.8283401
cov(data_A_std)
## V1 V2 V3
## V1 1.0000000 0.7407172 0.5335825
## V2 0.7407172 1.0000000 0.8714141
## V3 0.5335825 0.8714141 1.0000000
cor(data_A)
## V1 V2 V3
## V1 1.0000000 0.7407172 0.5335825
## V2 0.7407172 1.0000000 0.8714141
## V3 0.5335825 0.8714141 1.0000000
cor(data_A_std)
## V1 V2 V3
## V1 1.0000000 0.7407172 0.5335825
## V2 0.7407172 1.0000000 0.8714141
## V3 0.5335825 0.8714141 1.0000000
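# Confirming the comparison above (a sketch): after standardization, the
# covariance matrix coincides with the correlation matrix of the raw data.
all.equal(cov(data_A_std), cor(data_A), check.attributes = FALSE)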
# First Stage -------------------------------------------------------------
# PCA for dimension A
Dimensi_A <- princomp(data_A_std, cor = F) # the data are already standardized, so there is no need to base the PCA on the correlation matrix
summary(Dimensi_A) # component standard deviations (square roots of the eigenvalues) and explained variance
## Importance of components:
## Comp.1 Comp.2 Comp.3
## Standard deviation 1.5386602 0.6814502 0.28269241
## Proportion of Variance 0.8130723 0.1594821 0.02744556
## Cumulative Proportion 0.8130723 0.9725544 1.00000000
Dimensi_A$sdev
## Comp.1 Comp.2 Comp.3
## 1.5386602 0.6814502 0.2826924
Dimensi_A$loadings
##
## Loadings:
## Comp.1 Comp.2 Comp.3
## V1 0.533 0.792 0.298
## V2 0.622 -0.128 -0.773
## V3 0.574 -0.597 0.561
##
## Comp.1 Comp.2 Comp.3
## SS loadings 1.000 1.000 1.000
## Proportion Var 0.333 0.333 0.333
## Cumulative Var 0.333 0.667 1.000
Dimensi_A
## Call:
## princomp(x = data_A_std, cor = F)
##
## Standard deviations:
## Comp.1 Comp.2 Comp.3
## 1.5386602 0.6814502 0.2826924
##
## 3 variables and 34 observations.
score_A <- (Dimensi_A$scores %*% (matrix(Dimensi_A$sdev^2)))/sum(Dimensi_A$sdev^2)
weight_A <- (Dimensi_A$loadings %*% matrix(Dimensi_A$sdev^2))/sum(Dimensi_A$sdev^2)
# score_A = the score of dimension A, the first sub-index of the IFI
# weight_A = the weight attached to each input indicator that builds dimension A
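# Sanity check (a sketch): since the PCA scores are the standardized data times
# the loadings, score_A should equal the standardized data times weight_A.
all.equal(as.numeric(score_A),
          as.numeric(as.matrix(data_A_std) %*% weight_A))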
# PCA for dimension B
Dimensi_B <- princomp(data_B_std,cor=F)
summary(Dimensi_B) # component standard deviations (square roots of the eigenvalues) and explained variance
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 1.7239547 0.7571253 0.50909442 0.27913647
## Proportion of Variance 0.7655203 0.1476524 0.06675775 0.02006957
## Cumulative Proportion 0.7655203 0.9131727 0.97993043 1.00000000
Dimensi_B$sdev
## Comp.1 Comp.2 Comp.3 Comp.4
## 1.7239547 0.7571253 0.5090944 0.2791365
Dimensi_B$loadings
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4
## V1 0.485 0.482 0.728
## V2 0.421 -0.857 0.290
## V3 0.540 0.171 -0.433 -0.701
## V4 0.543 -0.445 0.709
##
## Comp.1 Comp.2 Comp.3 Comp.4
## SS loadings 1.00 1.00 1.00 1.00
## Proportion Var 0.25 0.25 0.25 0.25
## Cumulative Var 0.25 0.50 0.75 1.00
Dimensi_B
## Call:
## princomp(x = data_B_std, cor = F)
##
## Standard deviations:
## Comp.1 Comp.2 Comp.3 Comp.4
## 1.7239547 0.7571253 0.5090944 0.2791365
##
## 4 variables and 34 observations.
score_B <- (Dimensi_B$scores %*% (matrix(Dimensi_B$sdev^2)))/sum(Dimensi_B$sdev^2)
weight_B <- (Dimensi_B$loadings %*% matrix(Dimensi_B$sdev^2))/sum(Dimensi_B$sdev^2)
# Second Stage ------------------------------------------------------------
data_IFI <- data.frame(score_A,score_B)
data_IFI
## score_A score_B
## 1 -2.33516824 -3.10258009
## 2 -2.19771305 -2.64806683
## 3 -1.55857266 -2.29196662
## 4 -2.63086557 -2.32003084
## 5 -0.71427424 -0.58720166
## 6 -1.32037099 -1.29235102
## 7 -1.36700413 -1.43834072
## 8 -0.51677206 -0.06782368
## 9 0.10222908 -0.53817903
## 10 0.64790318 -1.42758104
## 11 -1.41914849 -0.84114981
## 12 -0.54753873 -1.03444706
## 13 -0.35236728 -0.24719595
## 14 -0.35756735 -0.18442923
## 15 0.52448850 0.16025524
## 16 -0.84677368 0.03491972
## 17 -0.53369322 0.11220465
## 18 0.07101375 0.65209427
## 19 0.60302095 0.37434702
## 20 1.10782356 1.18140456
## 21 -0.12555827 0.31878564
## 22 1.44913894 0.81354107
## 23 -0.38111065 1.06529989
## 24 0.43820481 0.73884774
## 25 1.25824358 0.50110744
## 26 0.22990015 0.24456392
## 27 -0.02169910 1.23289633
## 28 1.24172280 0.52515178
## 29 0.85901600 1.41676857
## 30 1.13758629 1.46862495
## 31 1.34548660 1.75460305
## 32 1.81782433 2.17123860
## 33 1.39015035 2.02370605
## 34 3.00244485 1.23098308
# Now that we have the sub-index scores, score_A and score_B,
# we proceed to the second-stage PCA using both scores.
# As in the first stage, we need to standardize the dimension scores first.
data_IFI_std <- data_IFI %>%
  mutate(score_A = (score_A - mean(score_A)) / sd(score_A),
         score_B = (score_B - mean(score_B)) / sd(score_B))
IFI <- princomp(data_IFI_std,cor=F)
summary(IFI) # component standard deviations (square roots of the eigenvalues) and explained variance
## Importance of components:
## Comp.1 Comp.2
## Standard deviation 1.3359346 0.3955441
## Proportion of Variance 0.9194019 0.0805981
## Cumulative Proportion 0.9194019 1.0000000
IFI$sdev
## Comp.1 Comp.2
## 1.3359346 0.3955441
IFI$loadings
##
## Loadings:
## Comp.1 Comp.2
## score_A 0.707 0.707
## score_B 0.707 -0.707
##
## Comp.1 Comp.2
## SS loadings 1.0 1.0
## Proportion Var 0.5 0.5
## Cumulative Var 0.5 1.0
score_IFI <- (IFI$scores %*% (matrix(IFI$sdev^2)))/sum(IFI$sdev^2)
weight_IFI <- (IFI$loadings %*% matrix(IFI$sdev^2))/sum(IFI$sdev^2)
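# Same sanity check as in the first stage (a sketch): the composite IFI score
# should equal the standardized sub-index scores times weight_IFI.
all.equal(as.numeric(score_IFI),
          as.numeric(as.matrix(data_IFI_std) %*% weight_IFI))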
# Viz ---------------------------------------------------------------------
# Finally, now that we have obtained the IFI from the previous calculations,
# we visualize the result with a line chart.
# To make the data easier to analyze, we transform each dimension score and the IFI
# through the normal CDF, so that the values range from 0 to 1.
data_Viz <- data.frame(dim_A = pnorm(score_A, mean = mean(score_A), sd = sd(score_A)),
                       dim_B = pnorm(score_B, mean = mean(score_B), sd = sd(score_B)),
                       IFI = pnorm(score_IFI, mean = mean(score_IFI), sd = sd(score_IFI)))
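# Quick check (a sketch): every transformed value should lie strictly
# between 0 and 1.
sapply(data_Viz, range)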
# map a series label (not a literal colour name) to the colour aesthetic so the legend is meaningful
ggplot(data_Viz, aes(x = 1:34)) +
  geom_line(aes(y = dim_A, colour = "Dimension A")) +
  geom_line(aes(y = dim_B, colour = "Dimension B")) +
  geom_line(aes(y = IFI, colour = "IFI")) +
  labs(x = "Observation", y = "Score", colour = NULL)
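# Alternative sketch: reshape to long format so a single geom_line() maps the
# series name to colour (uses tidyr's pivot_longer() and dplyr's row_number(),
# both loaded with the tidyverse).
data_Viz %>%
  mutate(obs = row_number()) %>%
  pivot_longer(c(dim_A, dim_B, IFI), names_to = "series", values_to = "score") %>%
  ggplot(aes(x = obs, y = score, colour = series)) +
  geom_line()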
